This document shows my R codes that I used to clean and analyze Hooli’s employee survey data. The document is structured as follows:
library(readxl)
df_original <- read_xlsx("data/sr._people_science_analyst_assignment_dataset_2024.xlsx")
## Warning: Expecting numeric in Q1601 / R1601C17: got 'N/A'
## Warning: Expecting numeric in N2047 / R2047C14: got 'N/A'
## Warning: Expecting numeric in I2380 / R2380C9: got 'N/A'
## Warning: Expecting numeric in E2515 / R2515C5: got 'N/A'
‘lea_3’ should be a numeric variable. After checking the data, employee ID M01562’s response is only missing for’lea_3’. Therefore, this ‘N/A’ should be recoded as -1.
‘hiredate’ should be a date variable. It is also important to note that the current format is written differently based on the country that an employee is from (it’s always important to check the entire dataset!!). Australia: Day/Month/Year Denmark: Day/Month/Year France: Day/Month/Year Germany: Day/Month/Year India: Day/Month/Year UK: Day/Month/Year United Kingdom: Day/Month/Year United States: Month/Day/Year USA: Month/Day/Year Canada: Month/Day/Year China: Year/Month/Day So in converting this variable to date-type, I need to specify this format in my code.
#checking the macro-level trends of the dataset
head(df_original, 5)
## # A tibble: 5 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01434 5 5 4 5 3 1 5 5 5 5 5 5
## 2 M00631 4 4 4 4 4 4 4 4 4 3 3 4
## 3 M00325 5 5 4 5 4 5 5 5 5 4 3 5
## 4 M00805 5 5 5 5 5 5 5 5 5 5 5 5
## 5 M00157 5 5 4 5 5 5 5 5 5 4 3 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <chr>, lea_4 <dbl>, age <chr>,
## # hiredate <chr>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
tail(df_original, 5)
## # A tibble: 5 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M02004 4 4 5 4 3 4 5 4 4 4 5 4
## 2 M02642 4 4 4 4 4 4 4 4 4 4 3 4
## 3 M01255 3 3 1 3 4 3 4 2 3 2 2 5
## 4 M01626 5 5 5 4 4 3 5 5 5 5 5 5
## 5 M02432 4 2 1 3 3 5 5 5 4 4 5 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <chr>, lea_4 <dbl>, age <chr>,
## # hiredate <chr>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
str(df_original)
## tibble [2,651 × 27] (S3: tbl_df/tbl/data.frame)
## $ eeid : chr [1:2651] "M01434" "M00631" "M00325" "M00805" ...
## $ ali_1 : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
## $ ali_2 : num [1:2651] 5 4 5 5 5 4 3 5 5 5 ...
## $ ali_3 : num [1:2651] 4 4 4 5 4 3 4 4 4 5 ...
## $ col_1 : num [1:2651] 5 4 5 5 5 4 3 4 5 5 ...
## $ col_2 : num [1:2651] 3 4 4 5 5 4 3 5 4 4 ...
## $ col_3 : num [1:2651] 1 4 5 5 5 4 3 5 4 5 ...
## $ eng_1 : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
## $ eng_2 : num [1:2651] 5 4 5 5 5 4 3 4 5 5 ...
## $ eng_3 : num [1:2651] 5 4 5 5 5 3 4 5 5 4 ...
## $ eng_4 : num [1:2651] 5 3 4 5 4 4 3 4 5 5 ...
## $ eng_5 : num [1:2651] 5 3 3 5 3 4 3 5 5 3 ...
## $ inc_1 : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
## $ inc_2 : num [1:2651] 5 4 5 5 5 3 4 4 4 4 ...
## $ inc_3 : num [1:2651] 5 4 5 5 5 2 4 4 5 5 ...
## $ inc_4 : num [1:2651] 5 4 5 4 4 4 4 5 5 4 ...
## $ inc_5 : num [1:2651] 3 3 5 3 5 4 4 5 5 5 ...
## $ lea_1 : num [1:2651] 4 4 5 5 5 2 5 5 5 3 ...
## $ lea_2 : num [1:2651] 5 5 5 5 5 4 5 5 5 5 ...
## $ lea_3 : chr [1:2651] "5" "5" "5" "5" ...
## $ lea_4 : num [1:2651] 5 5 5 5 5 4 5 5 5 5 ...
## $ age : chr [1:2651] "35-44" "45-54" "18-24" "45-54" ...
## $ hiredate : chr [1:2651] "25/02/2024" "02/10/2022" "20/10/2023" "08/12/2018" ...
## $ race : chr [1:2651] NA NA NA "White" ...
## $ gender : chr [1:2651] "Male" "Male" "Male" "Male" ...
## $ manager_status: chr [1:2651] "Non-Manager" "Non-Manager" "Non-Manager" "Manager" ...
## $ country : chr [1:2651] "France" "France" "United Kingdom" "United States" ...
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(df_original, main = 'missing data map for the Hooli Survey Data', col = c('purple', 'black'), legend = TRUE)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
#except for race, most values are not missing!
#Creating a new dataset that I will prepare and process for analysis
df <- df_original
#Fixing errors in 'lea_3'
table(df_original$lea_3)
##
## -1 1 2 3 4 5 99 N/A
## 12 22 58 197 1147 1213 1 1
df[df$lea_3 == "N/A", ]
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01562 4 3 4 4 4 5 4 5 4 4 3 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <chr>, lea_4 <dbl>, age <chr>,
## # hiredate <chr>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df$lea_3[df$lea_3 == "N/A"] <- -1
table(df$lea_3)
##
## -1 1 2 3 4 5 99
## 13 22 58 197 1147 1213 1
#transforming 'lea_3' to a numeric variable
df$lea_3 <- as.numeric(df$lea_3)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- df %>%
mutate(hiredate = case_when(
country %in% c("Australia", "Denmark", "France", "Germany", "India", "UK", "United Kingdom") ~ as.Date(hiredate, format = "%d/%m/%Y"),
country %in% c("United States", "USA", "Canada") ~ as.Date(hiredate, format = "%m/%d/%Y"),
country == "China" ~ as.Date(hiredate, format = "%Y/%m/%d"),
TRUE ~ as.Date(NA)
))
library(summarytools)
dfSummary(df$hiredate)
## df$hiredate was converted to a data frame
## Data Frame Summary
## df
## Dimensions: 2651 x 1
## Duplicates: 1215
##
## -----------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ---------- ------------------- ---------------------- --------------------- ---------- ---------
## 1 hiredate min : 2004-12-24 1436 distinct values : 2651 0
## [Date] med : 2022-06-30 : (100.0%) (0.0%)
## max : 2024-02-29 :
## range : 19y 2m 5d : :
## . . : : :
## -----------------------------------------------------------------------------------------------------
I identified several outliers, value 99, from these variables: ali_1, ali_2, ali_3, col_1, col_2, col_3, and lea_4. The value 99 is a value that is out of the scale of -1 to 5. I discovered that all these responses come from Denmark. Thus, it was probably a systematic coding error for these responses from Denmark. Given that these are psychological measures, it doesn’t make sense code 99 might denote an exceptional response out of -1 to 5 scale. Thus, I deem 99 as a coding error. Additionally, these employees are less than 1% of the participants in the data and that they completed all the other questions in the survey. Therefore, I recode 99 to -1, the code for missing value.
library(summarytools)
dfSummary(df)
## Data Frame Summary
## df
## Dimensions: 2651 x 27
## Duplicates: 0
##
## ----------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ---------------- ------------------------------ ---------------------- --------------------- ---------- ---------
## 1 eeid 1. M00001 1 ( 0.0%) 2651 0
## [character] 2. M00002 1 ( 0.0%) (100.0%) (0.0%)
## 3. M00003 1 ( 0.0%)
## 4. M00004 1 ( 0.0%)
## 5. M00005 1 ( 0.0%)
## 6. M00006 1 ( 0.0%)
## 7. M00007 1 ( 0.0%)
## 8. M00008 1 ( 0.0%)
## 9. M00009 1 ( 0.0%)
## 10. M00010 1 ( 0.0%)
## [ 2641 others ] 2641 (99.6%) IIIIIIIIIIIIIIIIIII
##
## 2 ali_1 Mean (sd) : 4.3 (2.8) -1 : 16 ( 0.6%) 2651 0
## [numeric] min < med < max: 1 : 27 ( 1.0%) (100.0%) (0.0%)
## -1 < 4 < 99 2 : 64 ( 2.4%)
## IQR (CV) : 1 (0.6) 3 : 208 ( 7.8%) I
## 4 : 1124 (42.4%) IIIIIIII
## 5 : 1210 (45.6%) IIIIIIIII
## 99 : 2 ( 0.1%)
##
## 3 ali_2 Mean (sd) : 4.2 (2.1) -1 : 23 ( 0.9%) 2651 0
## [numeric] min < med < max: 1 : 44 ( 1.7%) (100.0%) (0.0%)
## -1 < 4 < 99 2 : 86 ( 3.2%)
## IQR (CV) : 1 (0.5) 3 : 310 (11.7%) II
## 4 : 1088 (41.0%) IIIIIIII
## 5 : 1099 (41.5%) IIIIIIII
## 99 : 1 ( 0.0%)
##
## 4 ali_3 Mean (sd) : 3.8 (2.2) -1 : 23 ( 0.9%) 2651 0
## [numeric] min < med < max: 1 : 101 ( 3.8%) (100.0%) (0.0%)
## -1 < 4 < 99 2 : 232 ( 8.8%) I
## IQR (CV) : 2 (0.6) 3 : 496 (18.7%) III
## 4 : 1086 (41.0%) IIIIIIII
## 5 : 712 (26.9%) IIIII
## 99 : 1 ( 0.0%)
##
## 5 col_1 Mean (sd) : 4.2 (2.8) -1 : 10 ( 0.4%) 2650 1
## [numeric] min < med < max: 1 : 33 ( 1.2%) (100.0%) (0.0%)
## -1 < 4 < 99 2 : 114 ( 4.3%)
## IQR (CV) : 1 (0.7) 3 : 318 (12.0%) II
## 4 : 1135 (42.8%) IIIIIIII
## 5 : 1038 (39.2%) IIIIIII
## 99 : 2 ( 0.1%)
##
## 6 col_2 Mean (sd) : 3.7 (2.2) -1 : 76 ( 2.9%) 2651 0
## [numeric] min < med < max: 1 : 43 ( 1.6%) (100.0%) (0.0%)
## -1 < 4 < 99 2 : 130 ( 4.9%)
## IQR (CV) : 1 (0.6) 3 : 853 (32.2%) IIIIII
## 4 : 900 (33.9%) IIIIII
## 5 : 648 (24.4%) IIII
## 99 : 1 ( 0.0%)
##
## 7 col_3 Mean (sd) : 4.1 (2.1) -1 : 24 ( 0.9%) 2651 0
## [numeric] min < med < max: 1 : 55 ( 2.1%) (100.0%) (0.0%)
## -1 < 4 < 99 2 : 128 ( 4.8%)
## IQR (CV) : 1 (0.5) 3 : 327 (12.3%) II
## 4 : 1037 (39.1%) IIIIIII
## 5 : 1079 (40.7%) IIIIIIII
## 99 : 1 ( 0.0%)
##
## 8 eng_1 Mean (sd) : 4.5 (0.7) -1 : 6 ( 0.2%) 2651 0
## [numeric] min < med < max: 1 : 5 ( 0.2%) (100.0%) (0.0%)
## -1 < 5 < 5 2 : 23 ( 0.9%)
## IQR (CV) : 1 (0.2) 3 : 181 ( 6.8%) I
## 4 : 869 (32.8%) IIIIII
## 5 : 1567 (59.1%) IIIIIIIIIII
##
## 9 eng_2 Mean (sd) : 4.3 (0.9) -1 : 4 ( 0.2%) 2650 1
## [numeric] min < med < max: 1 : 48 ( 1.8%) (100.0%) (0.0%)
## -1 < 5 < 5 2 : 104 ( 3.9%)
## IQR (CV) : 1 (0.2) 3 : 243 ( 9.2%) I
## 4 : 827 (31.2%) IIIIII
## 5 : 1424 (53.7%) IIIIIIIIII
##
## 10 eng_3 Mean (sd) : 4.1 (1) -1 : 12 ( 0.5%) 2651 0
## [numeric] min < med < max: 1 : 51 ( 1.9%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 95 ( 3.6%)
## IQR (CV) : 1 (0.2) 3 : 379 (14.3%) II
## 4 : 995 (37.5%) IIIIIII
## 5 : 1119 (42.2%) IIIIIIII
##
## 11 eng_4 Mean (sd) : 3.9 (1.1) -1 : 8 ( 0.3%) 2651 0
## [numeric] min < med < max: 1 : 60 ( 2.3%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 249 ( 9.4%) I
## IQR (CV) : 2 (0.3) 3 : 523 (19.7%) III
## 4 : 950 (35.8%) IIIIIII
## 5 : 861 (32.5%) IIIIII
##
## 12 eng_5 Mean (sd) : 4 (1) -1 : 8 ( 0.3%) 2651 0
## [numeric] min < med < max: 1 : 55 ( 2.1%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 163 ( 6.1%) I
## IQR (CV) : 2 (0.3) 3 : 516 (19.5%) III
## 4 : 896 (33.8%) IIIIII
## 5 : 1013 (38.2%) IIIIIII
##
## 13 inc_1 Mean (sd) : 4.4 (0.9) -1 : 19 ( 0.7%) 2651 0
## [numeric] min < med < max: 1 : 26 ( 1.0%) (100.0%) (0.0%)
## -1 < 5 < 5 2 : 52 ( 2.0%)
## IQR (CV) : 1 (0.2) 3 : 196 ( 7.4%) I
## 4 : 828 (31.2%) IIIIII
## 5 : 1530 (57.7%) IIIIIIIIIII
##
## 14 inc_2 Mean (sd) : 3.9 (1.2) -1 : 78 ( 2.9%) 2650 1
## [numeric] min < med < max: 1 : 30 ( 1.1%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 103 ( 3.9%)
## IQR (CV) : 2 (0.3) 3 : 575 (21.7%) IIII
## 4 : 908 (34.3%) IIIIII
## 5 : 956 (36.1%) IIIIIII
##
## 15 inc_3 Mean (sd) : 4.1 (1.1) -1 : 50 ( 1.9%) 2651 0
## [numeric] min < med < max: 1 : 28 ( 1.1%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 91 ( 3.4%)
## IQR (CV) : 1 (0.3) 3 : 370 (14.0%) II
## 4 : 916 (34.6%) IIIIII
## 5 : 1196 (45.1%) IIIIIIIII
##
## 16 inc_4 Mean (sd) : 3.9 (1.2) -1 : 64 ( 2.4%) 2651 0
## [numeric] min < med < max: 1 : 33 ( 1.2%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 110 ( 4.1%)
## IQR (CV) : 2 (0.3) 3 : 634 (23.9%) IIII
## 4 : 895 (33.8%) IIIIII
## 5 : 915 (34.5%) IIIIII
##
## 17 inc_5 Mean (sd) : 4 (1.2) -1 : 46 ( 1.7%) 2650 1
## [numeric] min < med < max: 1 : 71 ( 2.7%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 125 ( 4.7%)
## IQR (CV) : 1 (0.3) 3 : 390 (14.7%) II
## 4 : 1018 (38.4%) IIIIIII
## 5 : 1000 (37.7%) IIIIIII
##
## 18 lea_1 Mean (sd) : 4.2 (1) -1 : 20 ( 0.8%) 2651 0
## [numeric] min < med < max: 1 : 24 ( 0.9%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 79 ( 3.0%)
## IQR (CV) : 1 (0.2) 3 : 335 (12.6%) II
## 4 : 1074 (40.5%) IIIIIIII
## 5 : 1119 (42.2%) IIIIIIII
##
## 19 lea_2 Mean (sd) : 4.2 (1) -1 : 19 ( 0.7%) 2651 0
## [numeric] min < med < max: 1 : 35 ( 1.3%) (100.0%) (0.0%)
## -1 < 4 < 5 2 : 75 ( 2.8%)
## IQR (CV) : 1 (0.2) 3 : 287 (10.8%) II
## 4 : 979 (36.9%) IIIIIII
## 5 : 1256 (47.4%) IIIIIIIII
##
## 20 lea_3 Mean (sd) : 4.3 (2) -1 : 13 ( 0.5%) 2651 0
## [numeric] min < med < max: 1 : 22 ( 0.8%) (100.0%) (0.0%)
## -1 < 4 < 99 2 : 58 ( 2.2%)
## IQR (CV) : 1 (0.5) 3 : 197 ( 7.4%) I
## 4 : 1147 (43.3%) IIIIIIII
## 5 : 1213 (45.8%) IIIIIIIII
## 99 : 1 ( 0.0%)
##
## 21 lea_4 Mean (sd) : 4.4 (2) -1 : 16 ( 0.6%) 2651 0
## [numeric] min < med < max: 1 : 15 ( 0.6%) (100.0%) (0.0%)
## -1 < 5 < 99 2 : 41 ( 1.5%)
## IQR (CV) : 1 (0.5) 3 : 267 (10.1%) II
## 4 : 930 (35.1%) IIIIIII
## 5 : 1381 (52.1%) IIIIIIIIII
## 99 : 1 ( 0.0%)
##
## 22 age 1. 25-34 1266 (47.8%) IIIIIIIII 2651 0
## [character] 2. 35-44 795 (30.0%) IIIII (100.0%) (0.0%)
## 3. 45-54 301 (11.4%) II
## 4. 18-24 147 ( 5.5%) I
## 5. 55-64 78 ( 2.9%)
## 6. 65+ 54 ( 2.0%)
## 7. 26 2 ( 0.1%)
## 8. N/A 2 ( 0.1%)
## 9. 19 1 ( 0.0%)
## 10. 21 1 ( 0.0%)
## [ 4 others ] 4 ( 0.2%)
##
## 23 hiredate min : 2004-12-24 1436 distinct values : 2651 0
## [Date] med : 2022-06-30 : (100.0%) (0.0%)
## max : 2024-02-29 :
## range : 19y 2m 5d : :
## . . : : :
##
## 24 race 1. American Indian/Alaskan N 2 ( 0.1%) 1584 1067
## [character] 2. Asian 359 (22.7%) IIII (59.8%) (40.2%)
## 3. Black or African American 37 ( 2.3%)
## 4. Black or African American 11 ( 0.7%)
## 5. Hispanic or Latino 80 ( 5.1%) I
## 6. Native Hawaiian or Other 2 ( 0.1%)
## 7. Two or More Races 43 ( 2.7%)
## 8. White 1050 (66.3%) IIIIIIIIIIIII
##
## 25 gender 1. Female 816 (30.8%) IIIIII 2651 0
## [character] 2. Male 1826 (68.9%) IIIIIIIIIIIII (100.0%) (0.0%)
## 3. Man 4 ( 0.2%)
## 4. Unknown 1 ( 0.0%)
## 5. Woman 4 ( 0.2%)
##
## 26 manager_status 1. Manager 654 (24.7%) IIII 2651 0
## [character] 2. Non-Manager 1997 (75.3%) IIIIIIIIIIIIIII (100.0%) (0.0%)
##
## 27 country 1. Australia 101 ( 3.8%) 2651 0
## [character] 2. Canada 84 ( 3.2%) (100.0%) (0.0%)
## 3. China 60 ( 2.3%)
## 4. Denmark 24 ( 0.9%)
## 5. France 136 ( 5.1%) I
## 6. Germany 48 ( 1.8%)
## 7. India 288 (10.9%) II
## 8. UK 11 ( 0.4%)
## 9. United Kingdom 348 (13.1%) II
## 10. United States 1520 (57.3%) IIIIIIIIIII
## 11. USA 31 ( 1.2%)
## ----------------------------------------------------------------------------------------------------------------------
#examine the responses of the employees who responded '99' to one of these questions.
df[df$ali_1 == 99, ] # other responses are in the normal scale range & 2 participants = M00238 & M00568
## # A tibble: 2 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M00238 99 4 4 4 4 4 5 3 4 3 3 5
## 2 M00568 99 5 5 5 3 5 5 5 5 5 5 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[df$ali_2 == 99, ] # except for ali_2 & lea_4, all the other responses are in the normal scale range & 1 participant = M01455
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01455 5 99 3 5 4 2 5 5 5 4 4 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[df$ali_3 == 99, ] # other responses are in the normal scale range & 1 participant = M02319
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M02319 4 4 99 4 3 4 4 5 4 4 4 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[df$col_1 == 99, ] # other responses are in the normal scale range & 2 participants = M01339, M01843
## # A tibble: 3 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01339 5 4 3 99 2 3 5 4 5 4 4 5
## 2 M01843 4 3 4 99 4 2 4 3 3 5 4 4
## 3 <NA> NA NA NA NA NA NA NA NA NA NA NA NA
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[df$col_2 == 99, ] # other responses are in the normal scale range & 1 participant = M00168
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M00168 4 4 4 2 99 3 5 4 5 4 4 4
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[df$col_3 == 99, ] # other responses are in the normal scale range & 1 participant = M01393
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01393 3 3 3 4 3 99 5 5 5 5 5 3
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[df$lea_3 == 99, ] # except fo lea_3, all the other responses are in the normal scale range & 1 participant = M01484
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01484 5 5 4 5 5 5 5 5 5 5 5 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[df$lea_4 == 99, ] # except for ali_2 & lea_4, all the other responses are in the normal scale range & 1 participant = M01455
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01455 5 99 3 5 4 2 5 5 5 4 4 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
# recode values with 99 to -1.
df$ali_1[df$ali_1 == 99] <- -1
df$ali_2[df$ali_2 == 99] <- -1
df$ali_3[df$ali_3 == 99] <- -1
df$col_1[df$col_1 == 99] <- -1
df$col_2[df$col_2 == 99] <- -1
df$col_3[df$col_3 == 99] <- -1
df$lea_3[df$lea_3 == 99] <- -1
df$lea_4[df$lea_4 == 99] <- -1
df[is.na(df$col_1), ] #for employee_ID M02429, only col_1 is missing. It should be coded as -1.
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M02429 5 5 4 NA 5 5 4 3 2 4 4 5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[is.na(df$eng_2), ] #for employee_ID M01597, only eng_2 is missing. It should be coded as -1.
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01597 4 3 3 4 4 4 3 NA 3 3 3 4
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[is.na(df$inc_2), ] #for employee_ID M01481, only inc_2 is missing. It should be coded as -1.
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01481 3 4 4 3 4 4 4 3 3 4 4 4
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df[is.na(df$inc_5), ] #for employee_ID M01722, only inc_5 is missing. It should be coded as -1.
## # A tibble: 1 × 27
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01722 4 4 4 4 4 4 4 3 4 4 4 3
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>
df$col_1[is.na(df$col_1)] <- -1
df$eng_2[is.na(df$eng_2)] <- -1
df$inc_2[is.na(df$inc_2)] <- -1
df$inc_5[is.na(df$inc_5)] <- -1
sum(duplicated(df$eeid))
## [1] 0
table(df$country)
##
## Australia Canada China Denmark France
## 101 84 60 24 136
## Germany India UK United Kingdom United States
## 48 288 11 348 1520
## USA
## 31
df$country[df$country == "UK"] <- "United Kingdom"
df$country[df$country == "USA"] <- "United States"
table(df$country)
##
## Australia Canada China Denmark France
## 101 84 60 24 136
## Germany India United Kingdom United States
## 48 288 359 1551
table(df$age, useNA = "always")
##
## 18-24 19 21 25-34 26 28 35-44 39 42 45-54 48 55-64 65+
## 147 1 1 1266 2 1 795 1 1 301 1 78 54
## N/A <NA>
## 2 0
df$age[df$age == "19"] <- "18-24"
df$age[df$age == "21"] <- "18-24"
df$age[df$age == "26"] <- "25-34"
df$age[df$age == "28"] <- "25-34"
df$age[df$age == "39"] <- "35-44"
df$age[df$age == "42"] <- "35-44"
df$age[df$age == "48"] <- "45-54"
table(df$race, useNA = "always")
##
## American Indian/Alaskan Native
## 2
## Asian
## 359
## Black or African American
## 37
## Black or African Americans
## 11
## Hispanic or Latino
## 80
## Native Hawaiian or Other Pacific Islander
## 2
## Two or More Races
## 43
## White
## 1050
## <NA>
## 1067
df$race[df$race == "Black or African Americans"] <- "Black or African American"
table(df$gender, useNA = "always")
##
## Female Male Man Unknown Woman <NA>
## 816 1826 4 1 4 0
df$gender[df$gender == "Man"] <- "Male"
df$gender[df$gender == "Woman"] <- "Female"
40.2% of race variable’s observations (1067) are missing. Let’s see if the values in this variable has patterns.
Based on the contingency table below,
Except for the United States, all the countries with missing information about race are largely ethnically homogeneous (e.g., Denmark). Therefore, the company might not collect information about employee’s race as the variance in race among employees might be too small to be a meaningful (For example, according to demographic research, about 90.2 percent of the population in Australia are white). Therefore, conducting an analysis about race for other countries except for the United States would not provide meaningful insights to Hooli.
c_t <- table(df$race, df$country, useNA = "always")
c_t
##
## Australia Canada China Denmark
## American Indian/Alaskan Native 0 0 0 0
## Asian 0 0 0 0
## Black or African American 0 0 0 0
## Hispanic or Latino 0 0 0 0
## Native Hawaiian or Other Pacific Islander 0 0 0 0
## Two or More Races 0 0 0 0
## White 0 24 5 0
## <NA> 101 60 55 24
##
## France Germany India United Kingdom
## American Indian/Alaskan Native 0 0 0 0
## Asian 0 0 0 0
## Black or African American 0 0 0 0
## Hispanic or Latino 0 0 0 0
## Native Hawaiian or Other Pacific Islander 0 0 0 0
## Two or More Races 0 0 0 0
## White 0 0 0 4
## <NA> 136 48 288 355
##
## United States <NA>
## American Indian/Alaskan Native 2 0
## Asian 359 0
## Black or African American 48 0
## Hispanic or Latino 80 0
## Native Hawaiian or Other Pacific Islander 2 0
## Two or More Races 43 0
## White 1017 0
## <NA> 0 0
* 0% of 'hiredate' variable's observations are missing.
df_2 <- subset(df, select = c(hiredate))
library(Amelia)
missmap(df_2)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
rm(df_2)
#the earliest & latest date for hiredate
min(df$hiredate, na.rm = TRUE) # 2014-12-24
## [1] "2004-12-24"
max(df$hiredate, na.rm = TRUE) # 2024-02-29
## [1] "2024-02-29"
#let's create a new variable named hireyear
df$hireyear <- format(df$hiredate, "%Y")
table(df$hireyear, useNA = "always")
##
## 2004 2005 2006 2010 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 <NA>
## 1 1 1 1 41 40 31 42 125 129 307 308 632 954 38 0
library(ggplot2)
ggplot(df, aes(x=hireyear)) + geom_bar()
ggplot(df, aes(x=hiredate)) + geom_histogram() #this company hired employees a ton in 2022!!
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
* Create a new variable called ‘tenure’ by using hiredate.
Substract 'hiredate' from March 1st 2024, the date instructed by the technical assessment rubric to create different tenure groups by using 'hiredate'. It's important to note that there are 11 employees who were hired on or after March 1st 2024. However, given that I am asked to analyze the data as of March 1st 2024, I omit these 11 observations to not introduce a bias in my data. For example, introducing this 11 observations into under 3 months might change the characteristics of those in that group as of March 1st 2024.
- Given that there are only 4 employees in '10+ years' groups, we do not report any findings about this group.
#let's create the cutpoint, which is 2024-03-01, to create tenure groups
cutpoint <- as.Date("2024-03-01")
#given that the average number of days in a month is 365/12 = 30.4166666667,
tenure_days <- as.numeric(difftime(cutpoint, df$hiredate, units = "days"))
tenure_months <- as.numeric(difftime(cutpoint, df$hiredate, units = "days")/(365/12))
#create tenure_groups
tenure_groups <- cut(tenure_months,
breaks = c(0, 3, 6, 12, 24, 48, 72, 120, Inf), # In months
labels = c("Under 3 months", "3-6 months", "6-12 months", "1-2 years", "2-4 years", "4-6 years", "6-10 years", "10+ years"), right = FALSE) # right = FALSE to EXCLUDE the right side month in the cateogry
#the distribution of employees in each tenure group
table(tenure_groups, useNA = "always")
## tenure_groups
## Under 3 months 3-6 months 6-12 months 1-2 years 2-4 years
## 64 325 507 670 644
## 4-6 years 6-10 years 10+ years <NA>
## 274 163 4 0
# Add tenure_group variable to data frame
df$tenure_group <- as.character(tenure_groups)
# Print the data frame with the new tenure_group variable
table(df$tenure_group, useNA = "always")
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years
## 670 4 644 325 274
## 6-10 years 6-12 months Under 3 months <NA>
## 163 507 64 0
df_2 <- df
df_2$ali_1[df_2$ali_1 == -1] <- NA
df_2$ali_2[df_2$ali_2 == -1] <- NA
df_2$ali_3[df_2$ali_3 == -1] <- NA
df_2$col_1[df_2$col_1 == -1] <- NA
df_2$col_2[df_2$col_2 == -1] <- NA
df_2$col_3[df_2$col_3 == -1] <- NA
df_2$eng_1[df_2$eng_1 == -1] <- NA
df_2$eng_2[df_2$eng_2 == -1] <- NA
df_2$eng_3[df_2$eng_3 == -1] <- NA
df_2$eng_4[df_2$eng_4 == -1] <- NA
df_2$eng_5[df_2$eng_5 == -1] <- NA
df_2$inc_1[df_2$inc_1 == -1] <- NA
df_2$inc_2[df_2$inc_2 == -1] <- NA
df_2$inc_3[df_2$inc_3 == -1] <- NA
df_2$inc_4[df_2$inc_4 == -1] <- NA
df_2$inc_5[df_2$inc_5 == -1] <- NA
df_2$lea_1[df_2$lea_1 == -1] <- NA
df_2$lea_2[df_2$lea_2 == -1] <- NA
df_2$lea_3[df_2$lea_3 == -1] <- NA
df_2$lea_4[df_2$lea_4 == -1] <- NA
df_2$age[df_2$age == "N/A"] <- NA
#we convert "Unknown" to NA as this category has less than 5 people and we do not report.
df_2$gender[df_2$gender == "Unknown"] <- NA
dfSummary(df_2)
## Data Frame Summary
## df_2
## Dimensions: 2651 x 29
## Duplicates: 0
##
## ----------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ---------------- ------------------------------ ---------------------- --------------------- ---------- ---------
## 1 eeid 1. M00001 1 ( 0.0%) 2651 0
## [character] 2. M00002 1 ( 0.0%) (100.0%) (0.0%)
## 3. M00003 1 ( 0.0%)
## 4. M00004 1 ( 0.0%)
## 5. M00005 1 ( 0.0%)
## 6. M00006 1 ( 0.0%)
## 7. M00007 1 ( 0.0%)
## 8. M00008 1 ( 0.0%)
## 9. M00009 1 ( 0.0%)
## 10. M00010 1 ( 0.0%)
## [ 2641 others ] 2641 (99.6%) IIIIIIIIIIIIIIIIIII
##
## 2 ali_1 Mean (sd) : 4.3 (0.8) 1 : 27 ( 1.0%) 2633 18
## [numeric] min < med < max: 2 : 64 ( 2.4%) (99.3%) (0.7%)
## 1 < 4 < 5 3 : 208 ( 7.9%) I
## IQR (CV) : 1 (0.2) 4 : 1124 (42.7%) IIIIIIII
## 5 : 1210 (46.0%) IIIIIIIII
##
## 3 ali_2 Mean (sd) : 4.2 (0.9) 1 : 44 ( 1.7%) 2627 24
## [numeric] min < med < max: 2 : 86 ( 3.3%) (99.1%) (0.9%)
## 1 < 4 < 5 3 : 310 (11.8%) II
## IQR (CV) : 1 (0.2) 4 : 1088 (41.4%) IIIIIIII
## 5 : 1099 (41.8%) IIIIIIII
##
## 4 ali_3 Mean (sd) : 3.8 (1.1) 1 : 101 ( 3.8%) 2627 24
## [numeric] min < med < max: 2 : 232 ( 8.8%) I (99.1%) (0.9%)
## 1 < 4 < 5 3 : 496 (18.9%) III
## IQR (CV) : 2 (0.3) 4 : 1086 (41.3%) IIIIIIII
## 5 : 712 (27.1%) IIIII
##
## 5 col_1 Mean (sd) : 4.1 (0.9) 1 : 33 ( 1.3%) 2638 13
## [numeric] min < med < max: 2 : 114 ( 4.3%) (99.5%) (0.5%)
## 1 < 4 < 5 3 : 318 (12.1%) II
## IQR (CV) : 1 (0.2) 4 : 1135 (43.0%) IIIIIIII
## 5 : 1038 (39.3%) IIIIIII
##
## 6 col_2 Mean (sd) : 3.8 (0.9) 1 : 43 ( 1.7%) 2574 77
## [numeric] min < med < max: 2 : 130 ( 5.1%) I (97.1%) (2.9%)
## 1 < 4 < 5 3 : 853 (33.1%) IIIIII
## IQR (CV) : 2 (0.2) 4 : 900 (35.0%) IIIIII
## 5 : 648 (25.2%) IIIII
##
## 7 col_3 Mean (sd) : 4.1 (1) 1 : 55 ( 2.1%) 2626 25
## [numeric] min < med < max: 2 : 128 ( 4.9%) (99.1%) (0.9%)
## 1 < 4 < 5 3 : 327 (12.5%) II
## IQR (CV) : 1 (0.2) 4 : 1037 (39.5%) IIIIIII
## 5 : 1079 (41.1%) IIIIIIII
##
## 8 eng_1 Mean (sd) : 4.5 (0.7) 1 : 5 ( 0.2%) 2645 6
## [numeric] min < med < max: 2 : 23 ( 0.9%) (99.8%) (0.2%)
## 1 < 5 < 5 3 : 181 ( 6.8%) I
## IQR (CV) : 1 (0.2) 4 : 869 (32.9%) IIIIII
## 5 : 1567 (59.2%) IIIIIIIIIII
##
## 9 eng_2 Mean (sd) : 4.3 (0.9) 1 : 48 ( 1.8%) 2646 5
## [numeric] min < med < max: 2 : 104 ( 3.9%) (99.8%) (0.2%)
## 1 < 5 < 5 3 : 243 ( 9.2%) I
## IQR (CV) : 1 (0.2) 4 : 827 (31.3%) IIIIII
## 5 : 1424 (53.8%) IIIIIIIIII
##
## 10 eng_3 Mean (sd) : 4.2 (0.9) 1 : 51 ( 1.9%) 2639 12
## [numeric] min < med < max: 2 : 95 ( 3.6%) (99.5%) (0.5%)
## 1 < 4 < 5 3 : 379 (14.4%) II
## IQR (CV) : 1 (0.2) 4 : 995 (37.7%) IIIIIII
## 5 : 1119 (42.4%) IIIIIIII
##
## 11 eng_4 Mean (sd) : 3.9 (1) 1 : 60 ( 2.3%) 2643 8
## [numeric] min < med < max: 2 : 249 ( 9.4%) I (99.7%) (0.3%)
## 1 < 4 < 5 3 : 523 (19.8%) III
## IQR (CV) : 2 (0.3) 4 : 950 (35.9%) IIIIIII
## 5 : 861 (32.6%) IIIIII
##
## 12 eng_5 Mean (sd) : 4 (1) 1 : 55 ( 2.1%) 2643 8
## [numeric] min < med < max: 2 : 163 ( 6.2%) I (99.7%) (0.3%)
## 1 < 4 < 5 3 : 516 (19.5%) III
## IQR (CV) : 2 (0.3) 4 : 896 (33.9%) IIIIII
## 5 : 1013 (38.3%) IIIIIII
##
## 13 inc_1 Mean (sd) : 4.4 (0.8) 1 : 26 ( 1.0%) 2632 19
## [numeric] min < med < max: 2 : 52 ( 2.0%) (99.3%) (0.7%)
## 1 < 5 < 5 3 : 196 ( 7.4%) I
## IQR (CV) : 1 (0.2) 4 : 828 (31.5%) IIIIII
## 5 : 1530 (58.1%) IIIIIIIIIII
##
## 14 inc_2 Mean (sd) : 4 (0.9) 1 : 30 ( 1.2%) 2572 79
## [numeric] min < med < max: 2 : 103 ( 4.0%) (97.0%) (3.0%)
## 1 < 4 < 5 3 : 575 (22.4%) IIII
## IQR (CV) : 2 (0.2) 4 : 908 (35.3%) IIIIIII
## 5 : 956 (37.2%) IIIIIII
##
## 15 inc_3 Mean (sd) : 4.2 (0.9) 1 : 28 ( 1.1%) 2601 50
## [numeric] min < med < max: 2 : 91 ( 3.5%) (98.1%) (1.9%)
## 1 < 4 < 5 3 : 370 (14.2%) II
## IQR (CV) : 1 (0.2) 4 : 916 (35.2%) IIIIIII
## 5 : 1196 (46.0%) IIIIIIIII
##
## 16 inc_4 Mean (sd) : 4 (0.9) 1 : 33 ( 1.3%) 2587 64
## [numeric] min < med < max: 2 : 110 ( 4.3%) (97.6%) (2.4%)
## 1 < 4 < 5 3 : 634 (24.5%) IIII
## IQR (CV) : 2 (0.2) 4 : 895 (34.6%) IIIIII
## 5 : 915 (35.4%) IIIIIII
##
## 17 inc_5 Mean (sd) : 4.1 (1) 1 : 71 ( 2.7%) 2604 47
## [numeric] min < med < max: 2 : 125 ( 4.8%) (98.2%) (1.8%)
## 1 < 4 < 5 3 : 390 (15.0%) II
## IQR (CV) : 1 (0.2) 4 : 1018 (39.1%) IIIIIII
## 5 : 1000 (38.4%) IIIIIII
##
## 18 lea_1 Mean (sd) : 4.2 (0.8) 1 : 24 ( 0.9%) 2631 20
## [numeric] min < med < max: 2 : 79 ( 3.0%) (99.2%) (0.8%)
## 1 < 4 < 5 3 : 335 (12.7%) II
## IQR (CV) : 1 (0.2) 4 : 1074 (40.8%) IIIIIIII
## 5 : 1119 (42.5%) IIIIIIII
##
## 19 lea_2 Mean (sd) : 4.3 (0.9) 1 : 35 ( 1.3%) 2632 19
## [numeric] min < med < max: 2 : 75 ( 2.8%) (99.3%) (0.7%)
## 1 < 4 < 5 3 : 287 (10.9%) II
## IQR (CV) : 1 (0.2) 4 : 979 (37.2%) IIIIIII
## 5 : 1256 (47.7%) IIIIIIIII
##
## 20 lea_3 Mean (sd) : 4.3 (0.8) 1 : 22 ( 0.8%) 2637 14
## [numeric] min < med < max: 2 : 58 ( 2.2%) (99.5%) (0.5%)
## 1 < 4 < 5 3 : 197 ( 7.5%) I
## IQR (CV) : 1 (0.2) 4 : 1147 (43.5%) IIIIIIII
## 5 : 1213 (46.0%) IIIIIIIII
##
## 21 lea_4 Mean (sd) : 4.4 (0.8) 1 : 15 ( 0.6%) 2634 17
## [numeric] min < med < max: 2 : 41 ( 1.6%) (99.4%) (0.6%)
## 1 < 5 < 5 3 : 267 (10.1%) II
## IQR (CV) : 1 (0.2) 4 : 930 (35.3%) IIIIIII
## 5 : 1381 (52.4%) IIIIIIIIII
##
## 22 age 1. 18-24 149 ( 5.6%) I 2649 2
## [character] 2. 25-34 1269 (47.9%) IIIIIIIII (99.9%) (0.1%)
## 3. 35-44 797 (30.1%) IIIIII
## 4. 45-54 302 (11.4%) II
## 5. 55-64 78 ( 2.9%)
## 6. 65+ 54 ( 2.0%)
##
## 23 hiredate min : 2004-12-24 1436 distinct values : 2651 0
## [Date] med : 2022-06-30 : (100.0%) (0.0%)
## max : 2024-02-29 :
## range : 19y 2m 5d : :
## . . : : :
##
## 24 race 1. American Indian/Alaskan N 2 ( 0.1%) 1584 1067
## [character] 2. Asian 359 (22.7%) IIII (59.8%) (40.2%)
## 3. Black or African American 48 ( 3.0%)
## 4. Hispanic or Latino 80 ( 5.1%) I
## 5. Native Hawaiian or Other 2 ( 0.1%)
## 6. Two or More Races 43 ( 2.7%)
## 7. White 1050 (66.3%) IIIIIIIIIIIII
##
## 25 gender 1. Female 820 (30.9%) IIIIII 2650 1
## [character] 2. Male 1830 (69.1%) IIIIIIIIIIIII (100.0%) (0.0%)
##
## 26 manager_status 1. Manager 654 (24.7%) IIII 2651 0
## [character] 2. Non-Manager 1997 (75.3%) IIIIIIIIIIIIIII (100.0%) (0.0%)
##
## 27 country 1. Australia 101 ( 3.8%) 2651 0
## [character] 2. Canada 84 ( 3.2%) (100.0%) (0.0%)
## 3. China 60 ( 2.3%)
## 4. Denmark 24 ( 0.9%)
## 5. France 136 ( 5.1%) I
## 6. Germany 48 ( 1.8%)
## 7. India 288 (10.9%) II
## 8. United Kingdom 359 (13.5%) II
## 9. United States 1551 (58.5%) IIIIIIIIIII
##
## 28 hireyear 1. 2023 954 (36.0%) IIIIIII 2651 0
## [character] 2. 2022 632 (23.8%) IIII (100.0%) (0.0%)
## 3. 2021 308 (11.6%) II
## 4. 2020 307 (11.6%) II
## 5. 2019 129 ( 4.9%)
## 6. 2018 125 ( 4.7%)
## 7. 2017 42 ( 1.6%)
## 8. 2014 41 ( 1.5%)
## 9. 2015 40 ( 1.5%)
## 10. 2024 38 ( 1.4%)
## [ 5 others ] 35 ( 1.3%)
##
## 29 tenure_group 1. 1-2 years 670 (25.3%) IIIII 2651 0
## [character] 2. 10+ years 4 ( 0.2%) (100.0%) (0.0%)
## 3. 2-4 years 644 (24.3%) IIII
## 4. 3-6 months 325 (12.3%) II
## 5. 4-6 years 274 (10.3%) II
## 6. 6-10 years 163 ( 6.1%) I
## 7. 6-12 months 507 (19.1%) III
## 8. Under 3 months 64 ( 2.4%)
## ----------------------------------------------------------------------------------------------------------------------
#the end of data-processing & preparation for analysis
# 2. Data Analysis
dfSummary(df_2)
## Data Frame Summary
## df_2
## Dimensions: 2651 x 29
## Duplicates: 0
##
## ----------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ---------------- ------------------------------ ---------------------- --------------------- ---------- ---------
## 1 eeid 1. M00001 1 ( 0.0%) 2651 0
## [character] 2. M00002 1 ( 0.0%) (100.0%) (0.0%)
## 3. M00003 1 ( 0.0%)
## 4. M00004 1 ( 0.0%)
## 5. M00005 1 ( 0.0%)
## 6. M00006 1 ( 0.0%)
## 7. M00007 1 ( 0.0%)
## 8. M00008 1 ( 0.0%)
## 9. M00009 1 ( 0.0%)
## 10. M00010 1 ( 0.0%)
## [ 2641 others ] 2641 (99.6%) IIIIIIIIIIIIIIIIIII
##
## 2 ali_1 Mean (sd) : 4.3 (0.8) 1 : 27 ( 1.0%) 2633 18
## [numeric] min < med < max: 2 : 64 ( 2.4%) (99.3%) (0.7%)
## 1 < 4 < 5 3 : 208 ( 7.9%) I
## IQR (CV) : 1 (0.2) 4 : 1124 (42.7%) IIIIIIII
## 5 : 1210 (46.0%) IIIIIIIII
##
## 3 ali_2 Mean (sd) : 4.2 (0.9) 1 : 44 ( 1.7%) 2627 24
## [numeric] min < med < max: 2 : 86 ( 3.3%) (99.1%) (0.9%)
## 1 < 4 < 5 3 : 310 (11.8%) II
## IQR (CV) : 1 (0.2) 4 : 1088 (41.4%) IIIIIIII
## 5 : 1099 (41.8%) IIIIIIII
##
## 4 ali_3 Mean (sd) : 3.8 (1.1) 1 : 101 ( 3.8%) 2627 24
## [numeric] min < med < max: 2 : 232 ( 8.8%) I (99.1%) (0.9%)
## 1 < 4 < 5 3 : 496 (18.9%) III
## IQR (CV) : 2 (0.3) 4 : 1086 (41.3%) IIIIIIII
## 5 : 712 (27.1%) IIIII
##
## 5 col_1 Mean (sd) : 4.1 (0.9) 1 : 33 ( 1.3%) 2638 13
## [numeric] min < med < max: 2 : 114 ( 4.3%) (99.5%) (0.5%)
## 1 < 4 < 5 3 : 318 (12.1%) II
## IQR (CV) : 1 (0.2) 4 : 1135 (43.0%) IIIIIIII
## 5 : 1038 (39.3%) IIIIIII
##
## 6 col_2 Mean (sd) : 3.8 (0.9) 1 : 43 ( 1.7%) 2574 77
## [numeric] min < med < max: 2 : 130 ( 5.1%) I (97.1%) (2.9%)
## 1 < 4 < 5 3 : 853 (33.1%) IIIIII
## IQR (CV) : 2 (0.2) 4 : 900 (35.0%) IIIIII
## 5 : 648 (25.2%) IIIII
##
## 7 col_3 Mean (sd) : 4.1 (1) 1 : 55 ( 2.1%) 2626 25
## [numeric] min < med < max: 2 : 128 ( 4.9%) (99.1%) (0.9%)
## 1 < 4 < 5 3 : 327 (12.5%) II
## IQR (CV) : 1 (0.2) 4 : 1037 (39.5%) IIIIIII
## 5 : 1079 (41.1%) IIIIIIII
##
## 8 eng_1 Mean (sd) : 4.5 (0.7) 1 : 5 ( 0.2%) 2645 6
## [numeric] min < med < max: 2 : 23 ( 0.9%) (99.8%) (0.2%)
## 1 < 5 < 5 3 : 181 ( 6.8%) I
## IQR (CV) : 1 (0.2) 4 : 869 (32.9%) IIIIII
## 5 : 1567 (59.2%) IIIIIIIIIII
##
## 9 eng_2 Mean (sd) : 4.3 (0.9) 1 : 48 ( 1.8%) 2646 5
## [numeric] min < med < max: 2 : 104 ( 3.9%) (99.8%) (0.2%)
## 1 < 5 < 5 3 : 243 ( 9.2%) I
## IQR (CV) : 1 (0.2) 4 : 827 (31.3%) IIIIII
## 5 : 1424 (53.8%) IIIIIIIIII
##
## 10 eng_3 Mean (sd) : 4.2 (0.9) 1 : 51 ( 1.9%) 2639 12
## [numeric] min < med < max: 2 : 95 ( 3.6%) (99.5%) (0.5%)
## 1 < 4 < 5 3 : 379 (14.4%) II
## IQR (CV) : 1 (0.2) 4 : 995 (37.7%) IIIIIII
## 5 : 1119 (42.4%) IIIIIIII
##
## 11 eng_4 Mean (sd) : 3.9 (1) 1 : 60 ( 2.3%) 2643 8
## [numeric] min < med < max: 2 : 249 ( 9.4%) I (99.7%) (0.3%)
## 1 < 4 < 5 3 : 523 (19.8%) III
## IQR (CV) : 2 (0.3) 4 : 950 (35.9%) IIIIIII
## 5 : 861 (32.6%) IIIIII
##
## 12 eng_5 Mean (sd) : 4 (1) 1 : 55 ( 2.1%) 2643 8
## [numeric] min < med < max: 2 : 163 ( 6.2%) I (99.7%) (0.3%)
## 1 < 4 < 5 3 : 516 (19.5%) III
## IQR (CV) : 2 (0.3) 4 : 896 (33.9%) IIIIII
## 5 : 1013 (38.3%) IIIIIII
##
## 13 inc_1 Mean (sd) : 4.4 (0.8) 1 : 26 ( 1.0%) 2632 19
## [numeric] min < med < max: 2 : 52 ( 2.0%) (99.3%) (0.7%)
## 1 < 5 < 5 3 : 196 ( 7.4%) I
## IQR (CV) : 1 (0.2) 4 : 828 (31.5%) IIIIII
## 5 : 1530 (58.1%) IIIIIIIIIII
##
## 14 inc_2 Mean (sd) : 4 (0.9) 1 : 30 ( 1.2%) 2572 79
## [numeric] min < med < max: 2 : 103 ( 4.0%) (97.0%) (3.0%)
## 1 < 4 < 5 3 : 575 (22.4%) IIII
## IQR (CV) : 2 (0.2) 4 : 908 (35.3%) IIIIIII
## 5 : 956 (37.2%) IIIIIII
##
## 15 inc_3 Mean (sd) : 4.2 (0.9) 1 : 28 ( 1.1%) 2601 50
## [numeric] min < med < max: 2 : 91 ( 3.5%) (98.1%) (1.9%)
## 1 < 4 < 5 3 : 370 (14.2%) II
## IQR (CV) : 1 (0.2) 4 : 916 (35.2%) IIIIIII
## 5 : 1196 (46.0%) IIIIIIIII
##
## 16 inc_4 Mean (sd) : 4 (0.9) 1 : 33 ( 1.3%) 2587 64
## [numeric] min < med < max: 2 : 110 ( 4.3%) (97.6%) (2.4%)
## 1 < 4 < 5 3 : 634 (24.5%) IIII
## IQR (CV) : 2 (0.2) 4 : 895 (34.6%) IIIIII
## 5 : 915 (35.4%) IIIIIII
##
## 17 inc_5 Mean (sd) : 4.1 (1) 1 : 71 ( 2.7%) 2604 47
## [numeric] min < med < max: 2 : 125 ( 4.8%) (98.2%) (1.8%)
## 1 < 4 < 5 3 : 390 (15.0%) II
## IQR (CV) : 1 (0.2) 4 : 1018 (39.1%) IIIIIII
## 5 : 1000 (38.4%) IIIIIII
##
## 18 lea_1 Mean (sd) : 4.2 (0.8) 1 : 24 ( 0.9%) 2631 20
## [numeric] min < med < max: 2 : 79 ( 3.0%) (99.2%) (0.8%)
## 1 < 4 < 5 3 : 335 (12.7%) II
## IQR (CV) : 1 (0.2) 4 : 1074 (40.8%) IIIIIIII
## 5 : 1119 (42.5%) IIIIIIII
##
## 19 lea_2 Mean (sd) : 4.3 (0.9) 1 : 35 ( 1.3%) 2632 19
## [numeric] min < med < max: 2 : 75 ( 2.8%) (99.3%) (0.7%)
## 1 < 4 < 5 3 : 287 (10.9%) II
## IQR (CV) : 1 (0.2) 4 : 979 (37.2%) IIIIIII
## 5 : 1256 (47.7%) IIIIIIIII
##
## 20 lea_3 Mean (sd) : 4.3 (0.8) 1 : 22 ( 0.8%) 2637 14
## [numeric] min < med < max: 2 : 58 ( 2.2%) (99.5%) (0.5%)
## 1 < 4 < 5 3 : 197 ( 7.5%) I
## IQR (CV) : 1 (0.2) 4 : 1147 (43.5%) IIIIIIII
## 5 : 1213 (46.0%) IIIIIIIII
##
## 21 lea_4 Mean (sd) : 4.4 (0.8) 1 : 15 ( 0.6%) 2634 17
## [numeric] min < med < max: 2 : 41 ( 1.6%) (99.4%) (0.6%)
## 1 < 5 < 5 3 : 267 (10.1%) II
## IQR (CV) : 1 (0.2) 4 : 930 (35.3%) IIIIIII
## 5 : 1381 (52.4%) IIIIIIIIII
##
## 22 age 1. 18-24 149 ( 5.6%) I 2649 2
## [character] 2. 25-34 1269 (47.9%) IIIIIIIII (99.9%) (0.1%)
## 3. 35-44 797 (30.1%) IIIIII
## 4. 45-54 302 (11.4%) II
## 5. 55-64 78 ( 2.9%)
## 6. 65+ 54 ( 2.0%)
##
## 23 hiredate min : 2004-12-24 1436 distinct values : 2651 0
## [Date] med : 2022-06-30 : (100.0%) (0.0%)
## max : 2024-02-29 :
## range : 19y 2m 5d : :
## . . : : :
##
## 24 race 1. American Indian/Alaskan N 2 ( 0.1%) 1584 1067
## [character] 2. Asian 359 (22.7%) IIII (59.8%) (40.2%)
## 3. Black or African American 48 ( 3.0%)
## 4. Hispanic or Latino 80 ( 5.1%) I
## 5. Native Hawaiian or Other 2 ( 0.1%)
## 6. Two or More Races 43 ( 2.7%)
## 7. White 1050 (66.3%) IIIIIIIIIIIII
##
## 25 gender 1. Female 820 (30.9%) IIIIII 2650 1
## [character] 2. Male 1830 (69.1%) IIIIIIIIIIIII (100.0%) (0.0%)
##
## 26 manager_status 1. Manager 654 (24.7%) IIII 2651 0
## [character] 2. Non-Manager 1997 (75.3%) IIIIIIIIIIIIIII (100.0%) (0.0%)
##
## 27 country 1. Australia 101 ( 3.8%) 2651 0
## [character] 2. Canada 84 ( 3.2%) (100.0%) (0.0%)
## 3. China 60 ( 2.3%)
## 4. Denmark 24 ( 0.9%)
## 5. France 136 ( 5.1%) I
## 6. Germany 48 ( 1.8%)
## 7. India 288 (10.9%) II
## 8. United Kingdom 359 (13.5%) II
## 9. United States 1551 (58.5%) IIIIIIIIIII
##
## 28 hireyear 1. 2023 954 (36.0%) IIIIIII 2651 0
## [character] 2. 2022 632 (23.8%) IIII (100.0%) (0.0%)
## 3. 2021 308 (11.6%) II
## 4. 2020 307 (11.6%) II
## 5. 2019 129 ( 4.9%)
## 6. 2018 125 ( 4.7%)
## 7. 2017 42 ( 1.6%)
## 8. 2014 41 ( 1.5%)
## 9. 2015 40 ( 1.5%)
## 10. 2024 38 ( 1.4%)
## [ 5 others ] 35 ( 1.3%)
##
## 29 tenure_group 1. 1-2 years 670 (25.3%) IIIII 2651 0
## [character] 2. 10+ years 4 ( 0.2%) (100.0%) (0.0%)
## 3. 2-4 years 644 (24.3%) IIII
## 4. 3-6 months 325 (12.3%) II
## 5. 4-6 years 274 (10.3%) II
## 6. 6-10 years 163 ( 6.1%) I
## 7. 6-12 months 507 (19.1%) III
## 8. Under 3 months 64 ( 2.4%)
## ----------------------------------------------------------------------------------------------------------------------
favorability_ali_1 <- sum(df_2$ali_1 %in% c(4,5))/sum(df_2$ali_1 %in% c(1:5))
favorability_ali_1
## [1] 0.8864413
#let's create a function that automatates the process of calculating favorability scores for all questions at once:
function_favorability <- function(df) {
favorability_score <- numeric(ncol(df)) # Include all columns
column_names <- names(df) # Get the column names
for (i in 1:ncol(df)) { # Start from column 1
# Calculate the proportion
favorability_score[i] <- sum(df[[i]] %in% c(4,5)) / sum(df[[i]] %in% c(1:5))
}
# Combine favorability scores and column names into a data frame
result <- data.frame(column_name = column_names, favorability_score = favorability_score)
return(result)
}
# Calculate favorability_score for columns 2 to 21
favorability_score <- function_favorability(df_2[, 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8864413
## 2 ali_2 0.8325086
## 3 ali_3 0.6844309
## 4 col_1 0.8237301
## 5 col_2 0.6013986
## 6 col_3 0.8057883
## 7 eng_1 0.9209830
## 8 eng_2 0.8507181
## 9 eng_3 0.8010610
## 10 eng_4 0.6852062
## 11 eng_5 0.7222853
## 12 inc_1 0.8958967
## 13 inc_2 0.7247278
## 14 inc_3 0.8119954
## 15 inc_4 0.6996521
## 16 inc_5 0.7749616
## 17 lea_1 0.8335234
## 18 lea_2 0.8491641
## 19 lea_3 0.8949564
## 20 lea_4 0.8773728
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
print(favorability_score_percent)
## question_number favorability_score
## 1 ali_1 89
## 2 ali_2 83
## 3 ali_3 68
## 4 col_1 82
## 5 col_2 60
## 6 col_3 81
## 7 eng_1 92
## 8 eng_2 85
## 9 eng_3 80
## 10 eng_4 69
## 11 eng_5 72
## 12 inc_1 90
## 13 inc_2 72
## 14 inc_3 81
## 15 inc_4 70
## 16 inc_5 77
## 17 lea_1 83
## 18 lea_2 85
## 19 lea_3 89
## 20 lea_4 88
#graph favorable score for each question
##setting different colors for different factors
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
##
group_colors <- c(
"Alignment" = "#D08770", # Light orange for Group 1
"Collaboration" = "#A3BE8C", # Light green for Group 2
"Engagement" = "#5E81AC", # Light blue for Group 3
"Inclusion" = "#EBCB8B", # Light yellow for Group 4
"Leadership" = "#B48EAD" # Mild purple for Group 5
)
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Question Item") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
- b. Calculate a ‘Factor Favorable Score’, which is the average
favorable score among questions within a factor, for each factor.
#let's create a function that automates the process of calculating of factor favorable score.
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8011269
## 2 col 0.7436390
## 3 eng 0.7960507
## 4 inc 0.7814467
## 5 lea 0.8637542
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 80 Alignment
## 1 ali_1 89 Alignment
## 2 ali_2 83 Alignment
## 3 ali_3 68 Alignment
## 22 col 74 Collaboration
## 4 col_1 82 Collaboration
## 5 col_2 60 Collaboration
## 6 col_3 81 Collaboration
## 23 eng 80 Engagement
## 7 eng_1 92 Engagement
## 8 eng_2 85 Engagement
## 9 eng_3 80 Engagement
## 10 eng_4 69 Engagement
## 11 eng_5 72 Engagement
## 24 inc 78 Inclusion
## 12 inc_1 90 Inclusion
## 13 inc_2 72 Inclusion
## 14 inc_3 81 Inclusion
## 15 inc_4 70 Inclusion
## 16 inc_5 77 Inclusion
## 25 lea 86 Leadership
## 17 lea_1 83 Leadership
## 18 lea_2 85 Leadership
## 19 lea_3 89 Leadership
## 20 lea_4 88 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's enagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "Overall",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 Overall 0.7960507
#create a graph that shows enagement favorable score across tenure groups (all)
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 80 Engagement
## 7 eng_1 92 Engagement
## 8 eng_2 85 Engagement
## 9 eng_3 80 Engagement
## 10 eng_4 69 Engagement
## 11 eng_5 72 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # Purple for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
* 2. Calculate engagement factor (favorable) scores by all tenure groups
(need to clarify this question..) - a. engagement factor favorable
scores for employees whose tenure are “Under 3 Months”. n = 64
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$tenure_group == "Under 3 months", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.984375
## 2 eng_2 0.859375
## 3 eng_3 0.906250
## 4 eng_4 0.890625
## 5 eng_5 0.828125
# Calculate the average engagement factor favorable score for those under 3 months
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
average_row <- data.frame(
question_number = "average",
favorability_score = average_score
)
favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
## question_number favorability_score
## 1 eng_1 0.984375
## 2 eng_2 0.859375
## 3 eng_3 0.906250
## 4 eng_4 0.890625
## 5 eng_5 0.828125
## 6 average 0.893750
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100
print(favorability_score_with_average_percent)
## question_number favorability_score
## 1 eng_1 98
## 2 eng_2 86
## 3 eng_3 91
## 4 eng_4 89
## 5 eng_5 83
## 6 average 89
#visualize the favorability scores
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # Purple for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with Under 3 Months Tenure") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "Under 3 Months"
#3-6 months
df_2[df_2$tenure_group == "3-6 months" & !is.na(df_2$tenure_group), ]
## # A tibble: 325 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M003… 5 5 4 5 4 5 5 5 5 4 3 5
## 2 M001… 5 5 3 5 3 5 5 5 5 4 5 5
## 3 M002… 5 5 5 5 5 5 5 5 5 4 4 5
## 4 M021… 5 4 4 4 3 5 5 3 4 2 4 5
## 5 M024… 5 4 2 4 3 3 4 5 4 3 3 4
## 6 M014… 4 4 4 4 4 4 4 2 3 2 4 3
## 7 M017… 5 5 5 5 5 5 5 5 5 5 5 5
## 8 M020… 4 4 3 4 4 4 5 5 4 4 5 5
## 9 M017… 3 4 3 4 3 5 5 5 5 4 5 2
## 10 M019… 5 5 5 5 5 5 5 5 5 5 5 5
## # ℹ 315 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "3-6 months", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9506173
## 2 eng_2 0.9040248
## 3 eng_3 0.8796296
## 4 eng_4 0.8209877
## 5 eng_5 0.8580247
# Calculate the average engagement factor favorable score for those with 3-6 months tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
average_row <- data.frame(
question_number = "average",
favorability_score = average_score
)
favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
## question_number favorability_score
## 1 eng_1 0.9506173
## 2 eng_2 0.9040248
## 3 eng_3 0.8796296
## 4 eng_4 0.8209877
## 5 eng_5 0.8580247
## 6 average 0.8826568
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100
print(favorability_score_with_average_percent)
## question_number favorability_score
## 1 eng_1 95
## 2 eng_2 90
## 3 eng_3 88
## 4 eng_4 82
## 5 eng_5 86
## 6 average 88
#visualize the favorability scores
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")
# graph showing engagement factor scores for the 3-6 months tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 3-6 Months Tenure") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "3-6 Months"
- c.engagement factor favorability scores for employees whose tenure are "6-12 months".
n = 507
#6-12 months
df_2[df_2$tenure_group == "6-12 months" & !is.na(df_2$tenure_group), ]
## # A tibble: 507 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M012… 4 3 4 3 3 3 4 3 4 3 3 4
## 2 M025… 5 5 4 4 5 5 5 4 5 4 5 5
## 3 M007… 4 3 3 5 3 4 4 4 3 3 3 4
## 4 M020… 5 5 5 5 5 5 5 5 4 5 5 5
## 5 M011… 4 5 4 4 4 5 5 5 5 5 5 4
## 6 M011… 5 4 4 5 2 5 5 4 5 2 3 5
## 7 M007… 4 4 4 4 4 4 5 4 4 5 4 5
## 8 M024… 2 4 3 3 4 4 3 1 2 3 2 3
## 9 M012… 4 2 2 4 5 3 4 3 4 3 3 5
## 10 M019… 5 5 5 5 5 5 5 5 5 5 5 5
## # ℹ 497 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "6-12 months", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9090909
## 2 eng_2 0.8106509
## 3 eng_3 0.8134921
## 4 eng_4 0.7199211
## 5 eng_5 0.7312253
# Calculate the average engagement factor favorable score for those with 6-12 months tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
average_row <- data.frame(
question_number = "average",
favorability_score = average_score
)
favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
## question_number favorability_score
## 1 eng_1 0.9090909
## 2 eng_2 0.8106509
## 3 eng_3 0.8134921
## 4 eng_4 0.7199211
## 5 eng_5 0.7312253
## 6 average 0.7968761
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100
print(favorability_score_with_average_percent)
## question_number favorability_score
## 1 eng_1 91
## 2 eng_2 81
## 3 eng_3 81
## 4 eng_4 72
## 5 eng_5 73
## 6 average 80
#visualize the favorability scores
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")
# graph showing engagement factor scores for the 6-12 months tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 6-12 Months Tenure") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "6-12 Months"
- d. engagement factor favorability scores for employees whose tenure are "1-2 years".
N = 670
#1-2 years
df_2[df_2$tenure_group == "1-2 years" & !is.na(df_2$tenure_group), ]
## # A tibble: 670 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M006… 4 4 4 4 4 4 4 4 4 3 3 4
## 2 M017… 4 4 3 4 4 4 4 4 3 4 4 4
## 3 M014… 5 5 4 5 5 5 5 5 4 3 3 5
## 4 M013… 4 5 4 5 4 5 5 5 5 4 5 5
## 5 M000… 5 4 4 3 5 3 5 5 5 5 5 5
## 6 M020… 5 4 3 5 3 4 5 5 3 4 4 5
## 7 M016… 3 3 2 4 3 4 4 4 4 3 3 3
## 8 M002… 4 4 5 4 3 5 5 5 4 3 4 5
## 9 M017… 4 4 3 4 4 4 4 4 5 4 4 4
## 10 M012… 5 5 4 5 4 4 4 4 4 5 5 4
## # ℹ 660 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "1-2 years", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9235382
## 2 eng_2 0.8579970
## 3 eng_3 0.7976012
## 4 eng_4 0.6616766
## 5 eng_5 0.7140719
# Calculate the average engagement factor favorable score for those with 1-2 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
average_row <- data.frame(
question_number = "average",
favorability_score = average_score
)
favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
## question_number favorability_score
## 1 eng_1 0.9235382
## 2 eng_2 0.8579970
## 3 eng_3 0.7976012
## 4 eng_4 0.6616766
## 5 eng_5 0.7140719
## 6 average 0.7909770
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100
print(favorability_score_with_average_percent)
## question_number favorability_score
## 1 eng_1 92
## 2 eng_2 86
## 3 eng_3 80
## 4 eng_4 66
## 5 eng_5 71
## 6 average 79
#visualize the favorability scores
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")
# graph showing engagement factor scores for the 1-2 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 1-2 years Tenure") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "1-2 Years"
- e. engagement factor favorability scores for employees whose tenure are "2-4 years".
n = 644
#2-4 years
df_2[df_2$tenure_group == "2-4 years" & !is.na(df_2$tenure_group), ]
## # A tibble: 644 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M010… 5 5 5 5 4 5 5 5 4 5 3 5
## 2 M017… NA NA NA NA NA NA 4 NA NA NA NA NA
## 3 M004… 4 5 5 4 5 5 5 3 4 4 4 4
## 4 M014… 4 4 2 3 2 3 4 4 3 2 3 5
## 5 M009… 4 5 4 4 5 5 4 1 1 3 4 5
## 6 M008… 5 5 5 5 5 5 5 5 5 5 5 5
## 7 M020… 5 5 5 5 4 5 5 5 5 5 5 5
## 8 M024… 5 5 4 3 5 5 5 5 3 2 3 5
## 9 M021… 3 5 4 2 5 5 3 2 3 2 3 3
## 10 M022… 4 2 4 4 2 1 3 3 5 3 3 4
## # ℹ 634 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "2-4 years", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9144635
## 2 eng_2 0.8429238
## 3 eng_3 0.7659906
## 4 eng_4 0.6209048
## 5 eng_5 0.6931464
# Calculate the average engagement factor favorable score for those with 2-4 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
average_row <- data.frame(
question_number = "average",
favorability_score = average_score
)
favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
## question_number favorability_score
## 1 eng_1 0.9144635
## 2 eng_2 0.8429238
## 3 eng_3 0.7659906
## 4 eng_4 0.6209048
## 5 eng_5 0.6931464
## 6 average 0.7674858
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100
print(favorability_score_with_average_percent)
## question_number favorability_score
## 1 eng_1 91
## 2 eng_2 84
## 3 eng_3 77
## 4 eng_4 62
## 5 eng_5 69
## 6 average 77
#visualize the favorability scores
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")
# graph showing engagement factor scores for the 2-4 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 2-4 years Tenure") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "2-4 Years"
- f. engagement factor scores for employees whose tenure are "4-6 years".
N = 274
#4-6 years
df_2[df_2$tenure_group == "4-6 years" & !is.na(df_2$tenure_group), ]
## # A tibble: 274 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M008… 5 5 5 5 5 5 5 5 5 5 5 5
## 2 M001… 5 5 4 5 5 5 5 5 5 4 3 5
## 3 M002… 5 5 4 5 4 4 5 5 5 5 5 5
## 4 M018… 4 4 4 2 5 5 4 1 3 4 3 4
## 5 M013… 5 5 4 4 NA 5 5 5 5 4 5 5
## 6 M006… 5 5 2 5 3 4 5 5 5 4 2 4
## 7 M009… 5 5 3 5 5 5 5 5 5 5 5 5
## 8 M005… 4 4 4 4 3 3 4 4 4 4 4 4
## 9 M023… 5 5 5 4 5 5 5 4 5 3 3 5
## 10 M009… 5 5 4 5 5 5 5 5 5 5 5 5
## # ℹ 264 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "4-6 years", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9014599
## 2 eng_2 0.8534799
## 3 eng_3 0.7867647
## 4 eng_4 0.6300366
## 5 eng_5 0.6167883
# Calculate the average engagement factor favorable score for those with 4-6 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
average_row <- data.frame(
question_number = "average",
favorability_score = average_score
)
favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
## question_number favorability_score
## 1 eng_1 0.9014599
## 2 eng_2 0.8534799
## 3 eng_3 0.7867647
## 4 eng_4 0.6300366
## 5 eng_5 0.6167883
## 6 average 0.7577059
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100
print(favorability_score_with_average_percent)
## question_number favorability_score
## 1 eng_1 90
## 2 eng_2 85
## 3 eng_3 79
## 4 eng_4 63
## 5 eng_5 62
## 6 average 76
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")
# graph showing engagement factor scores for the 4-6 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 4-6 years Tenure") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "4-6 Years"
- g. engagement factor scores for employees whose tenure are "6-10 years".
N = 163
#6-10 years
df_2[df_2$tenure_group == "6-10 years" & !is.na(df_2$tenure_group), ]
## # A tibble: 163 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M017… 4 4 4 4 4 3 4 4 4 4 4 4
## 2 M016… 5 5 5 5 4 5 5 5 4 3 4 5
## 3 M021… 5 5 4 5 4 5 5 5 4 5 5 5
## 4 M025… 4 4 4 4 4 4 5 5 3 3 4 4
## 5 M013… 4 3 3 3 3 3 4 3 3 2 2 4
## 6 M022… 3 3 4 5 3 3 4 3 3 2 2 4
## 7 M016… 5 5 4 5 5 5 5 5 5 5 5 5
## 8 M016… 4 4 3 4 3 4 4 4 3 3 3 4
## 9 M011… 4 NA NA 4 4 NA 4 4 4 4 3 4
## 10 M011… 5 5 5 4 3 3 4 5 4 4 4 4
## # ℹ 153 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "6-10 years", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9202454
## 2 eng_2 0.8588957
## 3 eng_3 0.7361963
## 4 eng_4 0.6604938
## 5 eng_5 0.7080745
# Calculate the average engagement factor favorable score for those with 6-10 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
average_row <- data.frame(
question_number = "average",
favorability_score = average_score
)
favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
## question_number favorability_score
## 1 eng_1 0.9202454
## 2 eng_2 0.8588957
## 3 eng_3 0.7361963
## 4 eng_4 0.6604938
## 5 eng_5 0.7080745
## 6 average 0.7767812
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100
print(favorability_score_with_average_percent)
## question_number favorability_score
## 1 eng_1 92
## 2 eng_2 86
## 3 eng_3 74
## 4 eng_4 66
## 5 eng_5 71
## 6 average 78
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")
# graph showing engagement factor scores for the 6-10 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 6-10 years Tenure") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "6-10 Years"
#it is critical that there are only 2 employees who have worked longer than 10 years.
#10+ years
df_2[df_2$tenure_group == "10+ years" & !is.na(df_2$tenure_group), ]
## # A tibble: 4 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01243 4 4 4 4 4 4 5 5 5 4 4 5
## 2 M01936 5 5 3 5 5 5 5 5 5 4 5 5
## 3 M01361 5 5 4 5 4 3 5 5 5 5 5 5
## 4 M02343 5 5 4 5 5 5 5 5 5 4 3 3
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "10+ years", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 1.00
## 2 eng_2 1.00
## 3 eng_3 1.00
## 4 eng_4 1.00
## 5 eng_5 0.75
# Calculate the average engagement factor favorable score for those with over 10 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)
average_row <- data.frame(
question_number = "average",
favorability_score = average_score
)
favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
## question_number favorability_score
## 1 eng_1 1.00
## 2 eng_2 1.00
## 3 eng_3 1.00
## 4 eng_4 1.00
## 5 eng_5 0.75
## 6 average 0.95
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100
print(favorability_score_with_average_percent)
## question_number favorability_score
## 1 eng_1 100
## 2 eng_2 100
## 3 eng_3 100
## 4 eng_4 100
## 5 eng_5 75
## 6 average 95
#visualize the favorability scores
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Favorability Scores for Employees with Over 10 Years Tenure") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5)
- i. Engagement factor favorable scores across tenure groups (employee life cycle)
engagement_by_tenure$favorability_score <- round(engagement_by_tenure$favorability_score*100)
engagement_by_tenure$question_number <- factor(engagement_by_tenure$question_number, levels = engagement_by_tenure$question_number)
engagement_by_tenure
## question_number favorability_score
## 1 Overall 80
## 2 Under 3 Months 89
## 3 3-6 Months 88
## 4 6-12 Months 80
## 5 1-2 Years 79
## 6 2-4 Years 77
## 7 4-6 Years 76
## 8 6-10 Years 78
# Distinguish the average row from other item rows
engagement_by_tenure$highlight <- ifelse(engagement_by_tenure$question_number == engagement_by_tenure$question_number[1], "first", "other")
# graph showing engagement factor scores across tenure groups
ggplot(engagement_by_tenure, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores Across Tenure Groups") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 11),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender)
##
## Female Male
## 820 1830
df_2[df_2$gender== "Female" & !is.na(df_2$gender), ]
## # A tibble: 820 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M017… 4 4 3 4 4 4 4 4 3 4 4 4
## 2 M010… 5 5 5 5 4 5 5 5 4 5 3 5
## 3 M001… 5 5 3 5 3 5 5 5 5 4 5 5
## 4 M011… 4 5 4 4 4 5 5 5 5 5 5 4
## 5 M011… 5 4 4 5 2 5 5 4 5 2 3 5
## 6 M002… 4 4 5 4 3 5 5 5 4 3 4 5
## 7 M008… 5 5 5 5 5 5 5 5 5 5 5 5
## 8 M007… 4 1 1 3 2 5 4 2 3 2 2 2
## 9 M024… 5 5 4 3 5 5 5 5 3 2 3 5
## 10 M007… 4 4 4 4 4 4 5 4 4 5 4 5
## # ℹ 810 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$gender == "Female", 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8441718
## 2 ali_2 0.7980296
## 3 ali_3 0.6584464
## 4 col_1 0.7610294
## 5 col_2 0.6022727
## 6 col_3 0.8204182
## 7 eng_1 0.8974359
## 8 eng_2 0.8095238
## 9 eng_3 0.7533742
## 10 eng_4 0.6328029
## 11 eng_5 0.6813725
## 12 inc_1 0.8620269
## 13 inc_2 0.6492537
## 14 inc_3 0.7382134
## 15 inc_4 0.6509317
## 16 inc_5 0.7719950
## 17 lea_1 0.8019680
## 18 lea_2 0.8009828
## 19 lea_3 0.8946078
## 20 lea_4 0.8378378
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 84 Alignment
## 2 ali_2 80 Alignment
## 3 ali_3 66 Alignment
## 4 col_1 76 Collaboration
## 5 col_2 60 Collaboration
## 6 col_3 82 Collaboration
## 7 eng_1 90 Engagement
## 8 eng_2 81 Engagement
## 9 eng_3 75 Engagement
## 10 eng_4 63 Engagement
## 11 eng_5 68 Engagement
## 12 inc_1 86 Inclusion
## 13 inc_2 65 Inclusion
## 14 inc_3 74 Inclusion
## 15 inc_4 65 Inclusion
## 16 inc_5 77 Inclusion
## 17 lea_1 80 Leadership
## 18 lea_2 80 Leadership
## 19 lea_3 89 Leadership
## 20 lea_4 84 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.7668826
## 2 col 0.7279068
## 3 eng 0.7549019
## 4 inc 0.7344841
## 5 lea 0.8338491
#change the favorability scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 77 Alignment
## 1 ali_1 84 Alignment
## 2 ali_2 80 Alignment
## 3 ali_3 66 Alignment
## 22 col 73 Collaboration
## 4 col_1 76 Collaboration
## 5 col_2 60 Collaboration
## 6 col_3 82 Collaboration
## 23 eng 75 Engagement
## 7 eng_1 90 Engagement
## 8 eng_2 81 Engagement
## 9 eng_3 75 Engagement
## 10 eng_4 63 Engagement
## 11 eng_5 68 Engagement
## 24 inc 73 Inclusion
## 12 inc_1 86 Inclusion
## 13 inc_2 65 Inclusion
## 14 inc_3 74 Inclusion
## 15 inc_4 65 Inclusion
## 16 inc_5 77 Inclusion
## 25 lea 83 Leadership
## 17 lea_1 80 Leadership
## 18 lea_2 80 Leadership
## 19 lea_3 89 Leadership
## 20 lea_4 84 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "Overall",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 Overall 0.7549019
#create a graph that shows engagement favorable score across tenure groups (all)
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 75 Engagement
## 7 eng_1 90 Engagement
## 8 eng_2 81 Engagement
## 9 eng_3 75 Engagement
## 10 eng_4 63 Engagement
## 11 eng_5 68 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
#Create a ggplot that shows engagement factor favorable scores for women
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
- b. favorable scores for all questions and factors for men
N = 1830
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$gender == "Male", 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.9053385
## 2 ali_2 0.8484013
## 3 ali_3 0.6958678
## 4 col_1 0.8517298
## 5 col_2 0.6007861
## 6 col_3 0.7991170
## 7 eng_1 0.9315068
## 8 eng_2 0.8691128
## 9 eng_3 0.8222710
## 10 eng_4 0.7090411
## 11 eng_5 0.7404162
## 12 inc_1 0.9111479
## 13 inc_2 0.7589134
## 14 inc_3 0.8450390
## 15 inc_4 0.7216611
## 16 inc_5 0.7761693
## 17 lea_1 0.8475509
## 18 lea_2 0.8706659
## 19 lea_3 0.8950549
## 20 lea_4 0.8949973
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
print(favorability_score_percent)
## question_number favorability_score
## 1 ali_1 91
## 2 ali_2 85
## 3 ali_3 70
## 4 col_1 85
## 5 col_2 60
## 6 col_3 80
## 7 eng_1 93
## 8 eng_2 87
## 9 eng_3 82
## 10 eng_4 71
## 11 eng_5 74
## 12 inc_1 91
## 13 inc_2 76
## 14 inc_3 85
## 15 inc_4 72
## 16 inc_5 78
## 17 lea_1 85
## 18 lea_2 87
## 19 lea_3 90
## 20 lea_4 89
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 91 Alignment
## 2 ali_2 85 Alignment
## 3 ali_3 70 Alignment
## 4 col_1 85 Collaboration
## 5 col_2 60 Collaboration
## 6 col_3 80 Collaboration
## 7 eng_1 93 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 82 Engagement
## 10 eng_4 71 Engagement
## 11 eng_5 74 Engagement
## 12 inc_1 91 Inclusion
## 13 inc_2 76 Inclusion
## 14 inc_3 85 Inclusion
## 15 inc_4 72 Inclusion
## 16 inc_5 78 Inclusion
## 17 lea_1 85 Leadership
## 18 lea_2 87 Leadership
## 19 lea_3 90 Leadership
## 20 lea_4 89 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8165359
## 2 col 0.7505443
## 3 eng 0.8144696
## 4 inc 0.8025861
## 5 lea 0.8770673
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 82 Alignment
## 1 ali_1 91 Alignment
## 2 ali_2 85 Alignment
## 3 ali_3 70 Alignment
## 22 col 75 Collaboration
## 4 col_1 85 Collaboration
## 5 col_2 60 Collaboration
## 6 col_3 80 Collaboration
## 23 eng 81 Engagement
## 7 eng_1 93 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 82 Engagement
## 10 eng_4 71 Engagement
## 11 eng_5 74 Engagement
## 24 inc 80 Inclusion
## 12 inc_1 91 Inclusion
## 13 inc_2 76 Inclusion
## 14 inc_3 85 Inclusion
## 15 inc_4 72 Inclusion
## 16 inc_5 78 Inclusion
## 25 lea 88 Leadership
## 17 lea_1 85 Leadership
## 18 lea_2 87 Leadership
## 19 lea_3 90 Leadership
## 20 lea_4 89 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "Overall",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 Overall 0.8144696
#create a graph that shows engagement favorable score across tenure groups (all)
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 81 Engagement
## 7 eng_1 93 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 82 Engagement
## 10 eng_4 71 Engagement
## 11 eng_5 74 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
# create a graph that shows engagement factor favorable scores for men
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
df_2[
df_2$gender == "Female" &
df_2$tenure_group == "Under 3 months" &
!is.na(df_2$gender), ]
## # A tibble: 25 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M016… 5 5 4 4 3 4 5 5 5 4 5 4
## 2 M017… 4 5 4 4 4 5 5 5 5 5 5 4
## 3 M016… 4 4 3 4 4 4 4 3 4 4 4 3
## 4 M009… 4 4 4 5 5 5 4 4 4 4 4 5
## 5 M023… 3 5 5 4 4 4 5 5 5 5 3 4
## 6 M000… 3 5 5 5 3 5 5 5 5 5 3 5
## 7 M020… 3 5 5 5 5 5 5 5 5 5 5 5
## 8 M021… 4 5 5 5 NA 5 5 5 4 5 5 5
## 9 M022… 4 5 3 4 3 5 5 5 4 5 5 5
## 10 M006… 5 5 4 4 NA 2 5 5 5 5 5 5
## # ℹ 15 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Female" &
df_2$tenure_group == "Under 3 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.6000000
## 2 ali_2 0.8800000
## 3 ali_3 0.6800000
## 4 col_1 0.8800000
## 5 col_2 0.6500000
## 6 col_3 0.7916667
## 7 eng_1 0.9600000
## 8 eng_2 0.8000000
## 9 eng_3 0.9200000
## 10 eng_4 0.9600000
## 11 eng_5 0.8000000
## 12 inc_1 0.9200000
## 13 inc_2 0.7826087
## 14 inc_3 0.9200000
## 15 inc_4 0.7600000
## 16 inc_5 0.9200000
## 17 lea_1 0.9600000
## 18 lea_2 0.9200000
## 19 lea_3 0.9600000
## 20 lea_4 0.9600000
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 60 Alignment
## 2 ali_2 88 Alignment
## 3 ali_3 68 Alignment
## 4 col_1 88 Collaboration
## 5 col_2 65 Collaboration
## 6 col_3 79 Collaboration
## 7 eng_1 96 Engagement
## 8 eng_2 80 Engagement
## 9 eng_3 92 Engagement
## 10 eng_4 96 Engagement
## 11 eng_5 80 Engagement
## 12 inc_1 92 Inclusion
## 13 inc_2 78 Inclusion
## 14 inc_3 92 Inclusion
## 15 inc_4 76 Inclusion
## 16 inc_5 92 Inclusion
## 17 lea_1 96 Leadership
## 18 lea_2 92 Leadership
## 19 lea_3 96 Leadership
## 20 lea_4 96 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.7200000
## 2 col 0.7738889
## 3 eng 0.8880000
## 4 inc 0.8605217
## 5 lea 0.9500000
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 72 Alignment
## 1 ali_1 60 Alignment
## 2 ali_2 88 Alignment
## 3 ali_3 68 Alignment
## 22 col 77 Collaboration
## 4 col_1 88 Collaboration
## 5 col_2 65 Collaboration
## 6 col_3 79 Collaboration
## 23 eng 89 Engagement
## 7 eng_1 96 Engagement
## 8 eng_2 80 Engagement
## 9 eng_3 92 Engagement
## 10 eng_4 96 Engagement
## 11 eng_5 80 Engagement
## 24 inc 86 Inclusion
## 12 inc_1 92 Inclusion
## 13 inc_2 78 Inclusion
## 14 inc_3 92 Inclusion
## 15 inc_4 76 Inclusion
## 16 inc_5 92 Inclusion
## 25 lea 95 Leadership
## 17 lea_1 96 Leadership
## 18 lea_2 92 Leadership
## 19 lea_3 96 Leadership
## 20 lea_4 96 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure under 3 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure under 3 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "Under 3 months",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 Under 3 months 0.888
#create a graph that shows engagement favorable scores for women with tenure under 3 months
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 89 Engagement
## 7 eng_1 96 Engagement
## 8 eng_2 80 Engagement
## 9 eng_3 92 Engagement
## 10 eng_4 96 Engagement
## 11 eng_5 80 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure Under 3 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Female" &
df_2$tenure_group == "3-6 months" &
!is.na(df_2$gender), ]
## # A tibble: 116 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M001… 5 5 3 5 3 5 5 5 5 4 5 5
## 2 M017… 3 4 3 4 3 5 5 5 5 4 5 2
## 3 M019… 5 5 5 5 5 5 5 5 5 5 5 5
## 4 M026… 4 4 4 4 4 4 5 5 5 5 4 4
## 5 M025… 5 5 4 5 5 5 5 5 5 5 5 5
## 6 M002… 4 3 4 3 4 4 5 3 4 4 3 4
## 7 M015… 3 3 3 3 3 3 4 4 4 4 4 4
## 8 M024… 4 4 4 4 4 4 4 5 4 4 5 5
## 9 M004… 5 5 5 5 5 5 5 5 5 5 5 5
## 10 M010… 4 4 4 4 5 4 4 2 3 3 3 3
## # ℹ 106 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Female" &
df_2$tenure_group == "3-6 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.9130435
## 2 ali_2 0.8782609
## 3 ali_3 0.7739130
## 4 col_1 0.8534483
## 5 col_2 0.5904762
## 6 col_3 0.9130435
## 7 eng_1 0.9396552
## 8 eng_2 0.9130435
## 9 eng_3 0.8448276
## 10 eng_4 0.7931034
## 11 eng_5 0.8448276
## 12 inc_1 0.8782609
## 13 inc_2 0.6725664
## 14 inc_3 0.7894737
## 15 inc_4 0.7433628
## 16 inc_5 0.7982456
## 17 lea_1 0.8347826
## 18 lea_2 0.8956522
## 19 lea_3 0.9391304
## 20 lea_4 0.8608696
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 91 Alignment
## 2 ali_2 88 Alignment
## 3 ali_3 77 Alignment
## 4 col_1 85 Collaboration
## 5 col_2 59 Collaboration
## 6 col_3 91 Collaboration
## 7 eng_1 94 Engagement
## 8 eng_2 91 Engagement
## 9 eng_3 84 Engagement
## 10 eng_4 79 Engagement
## 11 eng_5 84 Engagement
## 12 inc_1 88 Inclusion
## 13 inc_2 67 Inclusion
## 14 inc_3 79 Inclusion
## 15 inc_4 74 Inclusion
## 16 inc_5 80 Inclusion
## 17 lea_1 83 Leadership
## 18 lea_2 90 Leadership
## 19 lea_3 94 Leadership
## 20 lea_4 86 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8550725
## 2 col 0.7856560
## 3 eng 0.8670915
## 4 inc 0.7763819
## 5 lea 0.8826087
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 86 Alignment
## 1 ali_1 91 Alignment
## 2 ali_2 88 Alignment
## 3 ali_3 77 Alignment
## 22 col 79 Collaboration
## 4 col_1 85 Collaboration
## 5 col_2 59 Collaboration
## 6 col_3 91 Collaboration
## 23 eng 87 Engagement
## 7 eng_1 94 Engagement
## 8 eng_2 91 Engagement
## 9 eng_3 84 Engagement
## 10 eng_4 79 Engagement
## 11 eng_5 84 Engagement
## 24 inc 78 Inclusion
## 12 inc_1 88 Inclusion
## 13 inc_2 67 Inclusion
## 14 inc_3 79 Inclusion
## 15 inc_4 74 Inclusion
## 16 inc_5 80 Inclusion
## 25 lea 88 Leadership
## 17 lea_1 83 Leadership
## 18 lea_2 90 Leadership
## 19 lea_3 94 Leadership
## 20 lea_4 86 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 3-6 months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 3-6 months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "3-6 months",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 3-6 months 0.8670915
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 87 Engagement
## 7 eng_1 94 Engagement
## 8 eng_2 91 Engagement
## 9 eng_3 84 Engagement
## 10 eng_4 79 Engagement
## 11 eng_5 84 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 3-6 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
- f. favorable scores for all questions for women with tenure between
6-12 months N = 181
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Female" &
df_2$tenure_group == "6-12 months" &
!is.na(df_2$gender), ]
## # A tibble: 181 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M011… 4 5 4 4 4 5 5 5 5 5 5 4
## 2 M011… 5 4 4 5 2 5 5 4 5 2 3 5
## 3 M007… 4 4 4 4 4 4 5 4 4 5 4 5
## 4 M012… 4 2 2 4 5 3 4 3 4 3 3 5
## 5 M012… 3 3 2 3 3 3 3 3 1 3 3 3
## 6 M013… 4 5 5 4 3 5 4 3 3 4 3 5
## 7 M012… 5 5 5 5 4 5 5 5 5 5 5 5
## 8 M016… 5 4 4 2 2 5 3 3 2 2 3 5
## 9 M015… 4 4 3 5 3 3 4 5 4 3 3 4
## 10 M013… 4 4 4 NA 3 2 4 3 4 3 4 4
## # ℹ 171 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Female" &
df_2$tenure_group == "6-12 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8166667
## 2 ali_2 0.7777778
## 3 ali_3 0.7111111
## 4 col_1 0.7262570
## 5 col_2 0.5674157
## 6 col_3 0.7734807
## 7 eng_1 0.8950276
## 8 eng_2 0.7734807
## 9 eng_3 0.7932961
## 10 eng_4 0.7071823
## 11 eng_5 0.6944444
## 12 inc_1 0.8618785
## 13 inc_2 0.6944444
## 14 inc_3 0.8100559
## 15 inc_4 0.6815642
## 16 inc_5 0.7888889
## 17 lea_1 0.7944444
## 18 lea_2 0.8333333
## 19 lea_3 0.8674033
## 20 lea_4 0.8287293
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 82 Alignment
## 2 ali_2 78 Alignment
## 3 ali_3 71 Alignment
## 4 col_1 73 Collaboration
## 5 col_2 57 Collaboration
## 6 col_3 77 Collaboration
## 7 eng_1 90 Engagement
## 8 eng_2 77 Engagement
## 9 eng_3 79 Engagement
## 10 eng_4 71 Engagement
## 11 eng_5 69 Engagement
## 12 inc_1 86 Inclusion
## 13 inc_2 69 Inclusion
## 14 inc_3 81 Inclusion
## 15 inc_4 68 Inclusion
## 16 inc_5 79 Inclusion
## 17 lea_1 79 Leadership
## 18 lea_2 83 Leadership
## 19 lea_3 87 Leadership
## 20 lea_4 83 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.7685185
## 2 col 0.6890511
## 3 eng 0.7726862
## 4 inc 0.7673664
## 5 lea 0.8309776
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 77 Alignment
## 1 ali_1 82 Alignment
## 2 ali_2 78 Alignment
## 3 ali_3 71 Alignment
## 22 col 69 Collaboration
## 4 col_1 73 Collaboration
## 5 col_2 57 Collaboration
## 6 col_3 77 Collaboration
## 23 eng 77 Engagement
## 7 eng_1 90 Engagement
## 8 eng_2 77 Engagement
## 9 eng_3 79 Engagement
## 10 eng_4 71 Engagement
## 11 eng_5 69 Engagement
## 24 inc 77 Inclusion
## 12 inc_1 86 Inclusion
## 13 inc_2 69 Inclusion
## 14 inc_3 81 Inclusion
## 15 inc_4 68 Inclusion
## 16 inc_5 79 Inclusion
## 25 lea 83 Leadership
## 17 lea_1 79 Leadership
## 18 lea_2 83 Leadership
## 19 lea_3 87 Leadership
## 20 lea_4 83 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-12 months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-12 months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "6-12 months",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 6-12 months 0.7726862
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 77 Engagement
## 7 eng_1 90 Engagement
## 8 eng_2 77 Engagement
## 9 eng_3 79 Engagement
## 10 eng_4 71 Engagement
## 11 eng_5 69 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 6 and 12 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Female" &
df_2$tenure_group == "1-2 years" &
!is.na(df_2$gender), ]
## # A tibble: 211 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M017… 4 4 3 4 4 4 4 4 3 4 4 4
## 2 M002… 4 4 5 4 3 5 5 5 4 3 4 5
## 3 M007… 4 1 1 3 2 5 4 2 3 2 2 2
## 4 M016… 5 5 4 5 5 5 5 5 5 3 3 5
## 5 M012… 4 4 3 3 3 3 3 4 4 3 4 3
## 6 M009… 4 3 3 2 3 4 4 3 3 3 3 4
## 7 M009… 3 3 4 2 3 4 4 4 3 2 3 4
## 8 M024… 5 5 5 5 5 5 5 5 5 5 5 5
## 9 M022… 5 5 5 5 5 5 5 5 5 5 5 5
## 10 M019… 4 5 3 3 3 5 5 5 5 3 4 3
## # ℹ 201 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Female" &
df_2$tenure_group == "1-2 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8666667
## 2 ali_2 0.7809524
## 3 ali_3 0.6507177
## 4 col_1 0.7772512
## 5 col_2 0.6201923
## 6 col_3 0.8373206
## 7 eng_1 0.8904762
## 8 eng_2 0.8293839
## 9 eng_3 0.7320574
## 10 eng_4 0.5971564
## 11 eng_5 0.6952381
## 12 inc_1 0.9004739
## 13 inc_2 0.6666667
## 14 inc_3 0.7464115
## 15 inc_4 0.6411483
## 16 inc_5 0.8038278
## 17 lea_1 0.8173077
## 18 lea_2 0.8104265
## 19 lea_3 0.9238095
## 20 lea_4 0.8619048
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 87 Alignment
## 2 ali_2 78 Alignment
## 3 ali_3 65 Alignment
## 4 col_1 78 Collaboration
## 5 col_2 62 Collaboration
## 6 col_3 84 Collaboration
## 7 eng_1 89 Engagement
## 8 eng_2 83 Engagement
## 9 eng_3 73 Engagement
## 10 eng_4 60 Engagement
## 11 eng_5 70 Engagement
## 12 inc_1 90 Inclusion
## 13 inc_2 67 Inclusion
## 14 inc_3 75 Inclusion
## 15 inc_4 64 Inclusion
## 16 inc_5 80 Inclusion
## 17 lea_1 82 Leadership
## 18 lea_2 81 Leadership
## 19 lea_3 92 Leadership
## 20 lea_4 86 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.7661123
## 2 col 0.7449214
## 3 eng 0.7488624
## 4 inc 0.7517056
## 5 lea 0.8533621
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 77 Alignment
## 1 ali_1 87 Alignment
## 2 ali_2 78 Alignment
## 3 ali_3 65 Alignment
## 22 col 74 Collaboration
## 4 col_1 78 Collaboration
## 5 col_2 62 Collaboration
## 6 col_3 84 Collaboration
## 23 eng 75 Engagement
## 7 eng_1 89 Engagement
## 8 eng_2 83 Engagement
## 9 eng_3 73 Engagement
## 10 eng_4 60 Engagement
## 11 eng_5 70 Engagement
## 24 inc 75 Inclusion
## 12 inc_1 90 Inclusion
## 13 inc_2 67 Inclusion
## 14 inc_3 75 Inclusion
## 15 inc_4 64 Inclusion
## 16 inc_5 80 Inclusion
## 25 lea 85 Leadership
## 17 lea_1 82 Leadership
## 18 lea_2 81 Leadership
## 19 lea_3 92 Leadership
## 20 lea_4 86 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 1-2 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 1-2 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "1-2 years",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 1-2 years 0.7488624
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 75 Engagement
## 7 eng_1 89 Engagement
## 8 eng_2 83 Engagement
## 9 eng_3 73 Engagement
## 10 eng_4 60 Engagement
## 11 eng_5 70 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 1-2 Years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Female" &
df_2$tenure_group == "2-4 years" &
!is.na(df_2$gender), ]
## # A tibble: 191 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M010… 5 5 5 5 4 5 5 5 4 5 3 5
## 2 M008… 5 5 5 5 5 5 5 5 5 5 5 5
## 3 M024… 5 5 4 3 5 5 5 5 3 2 3 5
## 4 M021… 3 5 4 2 5 5 3 2 3 2 3 3
## 5 M022… 4 2 4 4 2 1 3 3 5 3 3 4
## 6 M020… 4 4 4 NA NA 4 3 3 2 NA 4 2
## 7 M013… 5 5 5 5 5 5 5 2 3 5 5 5
## 8 M024… 5 5 3 4 3 3 5 5 5 5 5 3
## 9 M001… 4 4 2 4 3 4 4 4 4 3 3 1
## 10 M009… 4 3 4 4 3 4 4 4 3 2 2 3
## # ℹ 181 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Female" &
df_2$tenure_group == "2-4 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8473684
## 2 ali_2 0.7765957
## 3 ali_3 0.6137566
## 4 col_1 0.7421053
## 5 col_2 0.6170213
## 6 col_3 0.8315789
## 7 eng_1 0.8900524
## 8 eng_2 0.7748691
## 9 eng_3 0.6910995
## 10 eng_4 0.5238095
## 11 eng_5 0.6000000
## 12 inc_1 0.8167539
## 13 inc_2 0.5828877
## 14 inc_3 0.6256684
## 15 inc_4 0.6276596
## 16 inc_5 0.7393617
## 17 lea_1 0.7801047
## 18 lea_2 0.7315789
## 19 lea_3 0.8691099
## 20 lea_4 0.8115183
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 85 Alignment
## 2 ali_2 78 Alignment
## 3 ali_3 61 Alignment
## 4 col_1 74 Collaboration
## 5 col_2 62 Collaboration
## 6 col_3 83 Collaboration
## 7 eng_1 89 Engagement
## 8 eng_2 77 Engagement
## 9 eng_3 69 Engagement
## 10 eng_4 52 Engagement
## 11 eng_5 60 Engagement
## 12 inc_1 82 Inclusion
## 13 inc_2 58 Inclusion
## 14 inc_3 63 Inclusion
## 15 inc_4 63 Inclusion
## 16 inc_5 74 Inclusion
## 17 lea_1 78 Leadership
## 18 lea_2 73 Leadership
## 19 lea_3 87 Leadership
## 20 lea_4 81 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.7459069
## 2 col 0.7302352
## 3 eng 0.6959661
## 4 inc 0.6784663
## 5 lea 0.7980780
#change the favorability scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 75 Alignment
## 1 ali_1 85 Alignment
## 2 ali_2 78 Alignment
## 3 ali_3 61 Alignment
## 22 col 73 Collaboration
## 4 col_1 74 Collaboration
## 5 col_2 62 Collaboration
## 6 col_3 83 Collaboration
## 23 eng 70 Engagement
## 7 eng_1 89 Engagement
## 8 eng_2 77 Engagement
## 9 eng_3 69 Engagement
## 10 eng_4 52 Engagement
## 11 eng_5 60 Engagement
## 24 inc 68 Inclusion
## 12 inc_1 82 Inclusion
## 13 inc_2 58 Inclusion
## 14 inc_3 63 Inclusion
## 15 inc_4 63 Inclusion
## 16 inc_5 74 Inclusion
## 25 lea 80 Leadership
## 17 lea_1 78 Leadership
## 18 lea_2 73 Leadership
## 19 lea_3 87 Leadership
## 20 lea_4 81 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 2-4 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 2-4 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "2-4 years",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 2-4 years 0.6959661
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 70 Engagement
## 7 eng_1 89 Engagement
## 8 eng_2 77 Engagement
## 9 eng_3 69 Engagement
## 10 eng_4 52 Engagement
## 11 eng_5 60 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 2-4 Years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Female" &
df_2$tenure_group == "4-6 years" &
!is.na(df_2$gender), ]
## # A tibble: 58 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M009… 5 5 4 5 5 5 5 5 5 5 5 5
## 2 M007… 4 4 3 4 5 5 4 4 4 3 3 5
## 3 M022… 2 5 4 3 5 5 4 4 1 4 5 3
## 4 M001… 4 4 2 3 3 5 4 4 3 2 3 5
## 5 M017… 4 5 4 3 4 5 5 5 5 5 5 4
## 6 M002… 4 4 4 3 3 NA 5 4 4 3 3 5
## 7 M018… 5 5 5 5 5 5 5 5 5 5 5 5
## 8 M015… 5 5 5 5 5 5 5 5 5 5 5 5
## 9 M007… 4 4 2 4 4 4 4 4 4 4 3 4
## 10 M018… 4 3 3 3 3 3 4 4 4 4 4 4
## # ℹ 48 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Female" &
df_2$tenure_group == "4-6 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8448276
## 2 ali_2 0.7894737
## 3 ali_3 0.5357143
## 4 col_1 0.6315789
## 5 col_2 0.6140351
## 6 col_3 0.7142857
## 7 eng_1 0.8620690
## 8 eng_2 0.7931034
## 9 eng_3 0.7368421
## 10 eng_4 0.4655172
## 11 eng_5 0.5000000
## 12 inc_1 0.7931034
## 13 inc_2 0.5964912
## 14 inc_3 0.6607143
## 15 inc_4 0.5818182
## 16 inc_5 0.6428571
## 17 lea_1 0.7321429
## 18 lea_2 0.6607143
## 19 lea_3 0.9107143
## 20 lea_4 0.8181818
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 84 Alignment
## 2 ali_2 79 Alignment
## 3 ali_3 54 Alignment
## 4 col_1 63 Collaboration
## 5 col_2 61 Collaboration
## 6 col_3 71 Collaboration
## 7 eng_1 86 Engagement
## 8 eng_2 79 Engagement
## 9 eng_3 74 Engagement
## 10 eng_4 47 Engagement
## 11 eng_5 50 Engagement
## 12 inc_1 79 Inclusion
## 13 inc_2 60 Inclusion
## 14 inc_3 66 Inclusion
## 15 inc_4 58 Inclusion
## 16 inc_5 64 Inclusion
## 17 lea_1 73 Leadership
## 18 lea_2 66 Leadership
## 19 lea_3 91 Leadership
## 20 lea_4 82 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.7233385
## 2 col 0.6532999
## 3 eng 0.6715064
## 4 inc 0.6549969
## 5 lea 0.7804383
#change the favorability scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 72 Alignment
## 1 ali_1 84 Alignment
## 2 ali_2 79 Alignment
## 3 ali_3 54 Alignment
## 22 col 65 Collaboration
## 4 col_1 63 Collaboration
## 5 col_2 61 Collaboration
## 6 col_3 71 Collaboration
## 23 eng 67 Engagement
## 7 eng_1 86 Engagement
## 8 eng_2 79 Engagement
## 9 eng_3 74 Engagement
## 10 eng_4 47 Engagement
## 11 eng_5 50 Engagement
## 24 inc 65 Inclusion
## 12 inc_1 79 Inclusion
## 13 inc_2 60 Inclusion
## 14 inc_3 66 Inclusion
## 15 inc_4 58 Inclusion
## 16 inc_5 64 Inclusion
## 25 lea 78 Leadership
## 17 lea_1 73 Leadership
## 18 lea_2 66 Leadership
## 19 lea_3 91 Leadership
## 20 lea_4 82 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 4-6 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 4-6 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "4-6 years",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 4-6 years 0.6715064
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 67 Engagement
## 7 eng_1 86 Engagement
## 8 eng_2 79 Engagement
## 9 eng_3 74 Engagement
## 10 eng_4 47 Engagement
## 11 eng_5 50 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 4-6 Years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Female" &
df_2$tenure_group == "6-10 years" &
!is.na(df_2$gender), ]
## # A tibble: 37 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M016… 5 5 5 5 4 5 5 5 4 3 4 5
## 2 M013… 4 3 3 3 3 3 4 3 3 2 2 4
## 3 M024… 5 5 3 5 5 5 4 4 4 3 4 4
## 4 M006… 5 5 5 5 3 4 5 5 5 5 4 5
## 5 M016… 5 5 3 4 4 4 5 5 4 4 4 5
## 6 M011… 5 5 5 4 5 5 5 4 4 5 5 5
## 7 M004… 5 5 4 4 3 4 5 5 5 4 4 5
## 8 M014… NA NA NA 5 NA 2 4 4 4 3 4 5
## 9 M021… 4 4 3 4 3 4 4 4 3 3 4 4
## 10 M010… 1 1 1 1 1 3 2 2 1 1 1 1
## # ℹ 27 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Female" &
df_2$tenure_group == "6-10 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.7777778
## 2 ali_2 0.8055556
## 3 ali_3 0.4722222
## 4 col_1 0.7567568
## 5 col_2 0.5714286
## 6 col_3 0.7837838
## 7 eng_1 0.8648649
## 8 eng_2 0.7567568
## 9 eng_3 0.6216216
## 10 eng_4 0.5555556
## 11 eng_5 0.6666667
## 12 inc_1 0.9189189
## 13 inc_2 0.6111111
## 14 inc_3 0.7714286
## 15 inc_4 0.4285714
## 16 inc_5 0.6764706
## 17 lea_1 0.7567568
## 18 lea_2 0.7777778
## 19 lea_3 0.7837838
## 20 lea_4 0.7500000
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 78 Alignment
## 2 ali_2 81 Alignment
## 3 ali_3 47 Alignment
## 4 col_1 76 Collaboration
## 5 col_2 57 Collaboration
## 6 col_3 78 Collaboration
## 7 eng_1 86 Engagement
## 8 eng_2 76 Engagement
## 9 eng_3 62 Engagement
## 10 eng_4 56 Engagement
## 11 eng_5 67 Engagement
## 12 inc_1 92 Inclusion
## 13 inc_2 61 Inclusion
## 14 inc_3 77 Inclusion
## 15 inc_4 43 Inclusion
## 16 inc_5 68 Inclusion
## 17 lea_1 76 Leadership
## 18 lea_2 78 Leadership
## 19 lea_3 78 Leadership
## 20 lea_4 75 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.6851852
## 2 col 0.7039897
## 3 eng 0.6930931
## 4 inc 0.6813001
## 5 lea 0.7670796
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 69 Alignment
## 1 ali_1 78 Alignment
## 2 ali_2 81 Alignment
## 3 ali_3 47 Alignment
## 22 col 70 Collaboration
## 4 col_1 76 Collaboration
## 5 col_2 57 Collaboration
## 6 col_3 78 Collaboration
## 23 eng 69 Engagement
## 7 eng_1 86 Engagement
## 8 eng_2 76 Engagement
## 9 eng_3 62 Engagement
## 10 eng_4 56 Engagement
## 11 eng_5 67 Engagement
## 24 inc 68 Inclusion
## 12 inc_1 92 Inclusion
## 13 inc_2 61 Inclusion
## 14 inc_3 77 Inclusion
## 15 inc_4 43 Inclusion
## 16 inc_5 68 Inclusion
## 25 lea 77 Leadership
## 17 lea_1 76 Leadership
## 18 lea_2 78 Leadership
## 19 lea_3 78 Leadership
## 20 lea_4 75 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-10 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-10 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "6-10 years",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 6-10 years 0.6930931
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 69 Engagement
## 7 eng_1 86 Engagement
## 8 eng_2 76 Engagement
## 9 eng_3 62 Engagement
## 10 eng_4 56 Engagement
## 11 eng_5 67 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 6-10 Years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
df_2[
df_2$gender == "Male" &
df_2$tenure_group == "Under 3 months" &
!is.na(df_2$gender), ]
## # A tibble: 39 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M014… 5 5 4 5 3 1 5 5 5 5 5 5
## 2 M004… 3 4 5 5 5 5 4 4 5 5 4 5
## 3 M005… 4 4 4 4 4 4 5 5 5 4 5 4
## 4 M021… 4 4 4 4 3 3 5 5 4 5 5 4
## 5 M020… 5 5 5 5 5 5 5 4 5 5 5 5
## 6 M002… 4 4 4 4 4 4 4 4 4 4 4 5
## 7 M024… 4 4 3 4 3 3 5 1 5 3 4 4
## 8 M008… 4 4 4 4 3 4 4 5 3 4 4 4
## 9 M012… 4 4 4 4 3 4 5 4 5 4 4 4
## 10 M024… 5 5 5 5 5 5 5 5 4 5 5 5
## # ℹ 29 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Male" &
df_2$tenure_group == "Under 3 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8461538
## 2 ali_2 0.9487179
## 3 ali_3 0.7948718
## 4 col_1 0.8717949
## 5 col_2 0.5142857
## 6 col_3 0.8717949
## 7 eng_1 1.0000000
## 8 eng_2 0.8974359
## 9 eng_3 0.8974359
## 10 eng_4 0.8461538
## 11 eng_5 0.8461538
## 12 inc_1 0.9487179
## 13 inc_2 0.7222222
## 14 inc_3 0.9210526
## 15 inc_4 0.7777778
## 16 inc_5 0.8974359
## 17 lea_1 0.7948718
## 18 lea_2 0.8974359
## 19 lea_3 0.9230769
## 20 lea_4 0.8205128
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 85 Alignment
## 2 ali_2 95 Alignment
## 3 ali_3 79 Alignment
## 4 col_1 87 Collaboration
## 5 col_2 51 Collaboration
## 6 col_3 87 Collaboration
## 7 eng_1 100 Engagement
## 8 eng_2 90 Engagement
## 9 eng_3 90 Engagement
## 10 eng_4 85 Engagement
## 11 eng_5 85 Engagement
## 12 inc_1 95 Inclusion
## 13 inc_2 72 Inclusion
## 14 inc_3 92 Inclusion
## 15 inc_4 78 Inclusion
## 16 inc_5 90 Inclusion
## 17 lea_1 79 Leadership
## 18 lea_2 90 Leadership
## 19 lea_3 92 Leadership
## 20 lea_4 82 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8632479
## 2 col 0.7526252
## 3 eng 0.8974359
## 4 inc 0.8534413
## 5 lea 0.8589744
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 86 Alignment
## 1 ali_1 85 Alignment
## 2 ali_2 95 Alignment
## 3 ali_3 79 Alignment
## 22 col 75 Collaboration
## 4 col_1 87 Collaboration
## 5 col_2 51 Collaboration
## 6 col_3 87 Collaboration
## 23 eng 90 Engagement
## 7 eng_1 100 Engagement
## 8 eng_2 90 Engagement
## 9 eng_3 90 Engagement
## 10 eng_4 85 Engagement
## 11 eng_5 85 Engagement
## 24 inc 85 Inclusion
## 12 inc_1 95 Inclusion
## 13 inc_2 72 Inclusion
## 14 inc_3 92 Inclusion
## 15 inc_4 78 Inclusion
## 16 inc_5 90 Inclusion
## 25 lea 86 Leadership
## 17 lea_1 79 Leadership
## 18 lea_2 90 Leadership
## 19 lea_3 92 Leadership
## 20 lea_4 82 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure under 3 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure under 3 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "Under 3 months",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 Under 3 months 0.8974359
#create a graph that shows engagement favorable score across tenure groups
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 90 Engagement
## 7 eng_1 100 Engagement
## 8 eng_2 90 Engagement
## 9 eng_3 90 Engagement
## 10 eng_4 85 Engagement
## 11 eng_5 85 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure Under 3 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
- j. favorable scores for all questions for Men with tenure between 3-6
months N = 116
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Male" &
df_2$tenure_group == "3-6 months" &
!is.na(df_2$gender), ]
## # A tibble: 209 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M003… 5 5 4 5 4 5 5 5 5 4 3 5
## 2 M002… 5 5 5 5 5 5 5 5 5 4 4 5
## 3 M021… 5 4 4 4 3 5 5 3 4 2 4 5
## 4 M024… 5 4 2 4 3 3 4 5 4 3 3 4
## 5 M014… 4 4 4 4 4 4 4 2 3 2 4 3
## 6 M017… 5 5 5 5 5 5 5 5 5 5 5 5
## 7 M020… 4 4 3 4 4 4 5 5 4 4 5 5
## 8 M010… 5 5 4 5 NA NA 5 5 5 5 5 5
## 9 M023… 5 5 4 5 5 5 5 5 5 5 5 5
## 10 M008… 5 5 5 5 5 5 5 5 5 5 5 5
## # ℹ 199 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Male" &
df_2$tenure_group == "3-6 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.9320388
## 2 ali_2 0.8888889
## 3 ali_3 0.7584541
## 4 col_1 0.9033816
## 5 col_2 0.5322581
## 6 col_3 0.8689320
## 7 eng_1 0.9567308
## 8 eng_2 0.8990385
## 9 eng_3 0.8990385
## 10 eng_4 0.8365385
## 11 eng_5 0.8653846
## 12 inc_1 0.9268293
## 13 inc_2 0.7587940
## 14 inc_3 0.8712871
## 15 inc_4 0.7450000
## 16 inc_5 0.7980296
## 17 lea_1 0.8743961
## 18 lea_2 0.9077670
## 19 lea_3 0.9275362
## 20 lea_4 0.9275362
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 93 Alignment
## 2 ali_2 89 Alignment
## 3 ali_3 76 Alignment
## 4 col_1 90 Collaboration
## 5 col_2 53 Collaboration
## 6 col_3 87 Collaboration
## 7 eng_1 96 Engagement
## 8 eng_2 90 Engagement
## 9 eng_3 90 Engagement
## 10 eng_4 84 Engagement
## 11 eng_5 87 Engagement
## 12 inc_1 93 Inclusion
## 13 inc_2 76 Inclusion
## 14 inc_3 87 Inclusion
## 15 inc_4 74 Inclusion
## 16 inc_5 80 Inclusion
## 17 lea_1 87 Leadership
## 18 lea_2 91 Leadership
## 19 lea_3 93 Leadership
## 20 lea_4 93 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8597939
## 2 col 0.7681906
## 3 eng 0.8913462
## 4 inc 0.8199880
## 5 lea 0.9093089
#change the favorability scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 86 Alignment
## 1 ali_1 93 Alignment
## 2 ali_2 89 Alignment
## 3 ali_3 76 Alignment
## 22 col 77 Collaboration
## 4 col_1 90 Collaboration
## 5 col_2 53 Collaboration
## 6 col_3 87 Collaboration
## 23 eng 89 Engagement
## 7 eng_1 96 Engagement
## 8 eng_2 90 Engagement
## 9 eng_3 90 Engagement
## 10 eng_4 84 Engagement
## 11 eng_5 87 Engagement
## 24 inc 82 Inclusion
## 12 inc_1 93 Inclusion
## 13 inc_2 76 Inclusion
## 14 inc_3 87 Inclusion
## 15 inc_4 74 Inclusion
## 16 inc_5 80 Inclusion
## 25 lea 91 Leadership
## 17 lea_1 87 Leadership
## 18 lea_2 91 Leadership
## 19 lea_3 93 Leadership
## 20 lea_4 93 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 3-6 months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 3-6 months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "3-6 months",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 3-6 months 0.8913462
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 89 Engagement
## 7 eng_1 96 Engagement
## 8 eng_2 90 Engagement
## 9 eng_3 90 Engagement
## 10 eng_4 84 Engagement
## 11 eng_5 87 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 3-6 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Male" &
df_2$tenure_group == "6-12 months" &
!is.na(df_2$gender), ]
## # A tibble: 325 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M012… 4 3 4 3 3 3 4 3 4 3 3 4
## 2 M025… 5 5 4 4 5 5 5 4 5 4 5 5
## 3 M007… 4 3 3 5 3 4 4 4 3 3 3 4
## 4 M020… 5 5 5 5 5 5 5 5 4 5 5 5
## 5 M024… 2 4 3 3 4 4 3 1 2 3 2 3
## 6 M019… 5 5 5 5 5 5 5 5 5 5 5 5
## 7 M007… 5 4 3 4 4 3 5 5 5 5 5 3
## 8 M013… 5 5 5 5 5 5 5 5 5 5 5 5
## 9 M023… 5 5 5 5 5 5 4 5 5 4 5 4
## 10 M001… 4 4 4 2 3 3 4 3 5 4 4 4
## # ℹ 315 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Male" &
df_2$tenure_group == "6-12 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8981481
## 2 ali_2 0.8452012
## 3 ali_3 0.6677019
## 4 col_1 0.8664596
## 5 col_2 0.5460317
## 6 col_3 0.8204334
## 7 eng_1 0.9166667
## 8 eng_2 0.8307692
## 9 eng_3 0.8240741
## 10 eng_4 0.7292308
## 11 eng_5 0.7507692
## 12 inc_1 0.9068323
## 13 inc_2 0.7746032
## 14 inc_3 0.8593750
## 15 inc_4 0.6813880
## 16 inc_5 0.7875000
## 17 lea_1 0.8456790
## 18 lea_2 0.8641975
## 19 lea_3 0.9138462
## 20 lea_4 0.8947368
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 90 Alignment
## 2 ali_2 85 Alignment
## 3 ali_3 67 Alignment
## 4 col_1 87 Collaboration
## 5 col_2 55 Collaboration
## 6 col_3 82 Collaboration
## 7 eng_1 92 Engagement
## 8 eng_2 83 Engagement
## 9 eng_3 82 Engagement
## 10 eng_4 73 Engagement
## 11 eng_5 75 Engagement
## 12 inc_1 91 Inclusion
## 13 inc_2 77 Inclusion
## 14 inc_3 86 Inclusion
## 15 inc_4 68 Inclusion
## 16 inc_5 79 Inclusion
## 17 lea_1 85 Leadership
## 18 lea_2 86 Leadership
## 19 lea_3 91 Leadership
## 20 lea_4 89 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8036837
## 2 col 0.7443083
## 3 eng 0.8103020
## 4 inc 0.8019397
## 5 lea 0.8796149
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 80 Alignment
## 1 ali_1 90 Alignment
## 2 ali_2 85 Alignment
## 3 ali_3 67 Alignment
## 22 col 74 Collaboration
## 4 col_1 87 Collaboration
## 5 col_2 55 Collaboration
## 6 col_3 82 Collaboration
## 23 eng 81 Engagement
## 7 eng_1 92 Engagement
## 8 eng_2 83 Engagement
## 9 eng_3 82 Engagement
## 10 eng_4 73 Engagement
## 11 eng_5 75 Engagement
## 24 inc 80 Inclusion
## 12 inc_1 91 Inclusion
## 13 inc_2 77 Inclusion
## 14 inc_3 86 Inclusion
## 15 inc_4 68 Inclusion
## 16 inc_5 79 Inclusion
## 25 lea 88 Leadership
## 17 lea_1 85 Leadership
## 18 lea_2 86 Leadership
## 19 lea_3 91 Leadership
## 20 lea_4 89 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-12 months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-12 months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "6-12 months",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 6-12 months 0.810302
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 81 Engagement
## 7 eng_1 92 Engagement
## 8 eng_2 83 Engagement
## 9 eng_3 82 Engagement
## 10 eng_4 73 Engagement
## 11 eng_5 75 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 6 and 12 Months") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Male" &
df_2$tenure_group == "1-2 years" &
!is.na(df_2$gender), ]
## # A tibble: 459 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M006… 4 4 4 4 4 4 4 4 4 3 3 4
## 2 M014… 5 5 4 5 5 5 5 5 4 3 3 5
## 3 M013… 4 5 4 5 4 5 5 5 5 4 5 5
## 4 M000… 5 4 4 3 5 3 5 5 5 5 5 5
## 5 M020… 5 4 3 5 3 4 5 5 3 4 4 5
## 6 M016… 3 3 2 4 3 4 4 4 4 3 3 3
## 7 M017… 4 4 3 4 4 4 4 4 5 4 4 4
## 8 M012… 5 5 4 5 4 4 4 4 4 5 5 4
## 9 M015… 3 4 4 4 3 5 4 4 4 4 3 4
## 10 M022… 5 5 5 5 4 3 5 5 5 5 5 5
## # ℹ 449 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Male" &
df_2$tenure_group == "1-2 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.9102845
## 2 ali_2 0.8231441
## 3 ali_3 0.6710240
## 4 col_1 0.8518519
## 5 col_2 0.6228070
## 6 col_3 0.8358862
## 7 eng_1 0.9387309
## 8 eng_2 0.8711790
## 9 eng_3 0.8275109
## 10 eng_4 0.6914661
## 11 eng_5 0.7227074
## 12 inc_1 0.9080963
## 13 inc_2 0.7550562
## 14 inc_3 0.8505495
## 15 inc_4 0.7676991
## 16 inc_5 0.7802198
## 17 lea_1 0.8769231
## 18 lea_2 0.8796499
## 19 lea_3 0.8903509
## 20 lea_4 0.9082969
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 91 Alignment
## 2 ali_2 82 Alignment
## 3 ali_3 67 Alignment
## 4 col_1 85 Collaboration
## 5 col_2 62 Collaboration
## 6 col_3 84 Collaboration
## 7 eng_1 94 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 83 Engagement
## 10 eng_4 69 Engagement
## 11 eng_5 72 Engagement
## 12 inc_1 91 Inclusion
## 13 inc_2 76 Inclusion
## 14 inc_3 85 Inclusion
## 15 inc_4 77 Inclusion
## 16 inc_5 78 Inclusion
## 17 lea_1 88 Leadership
## 18 lea_2 88 Leadership
## 19 lea_3 89 Leadership
## 20 lea_4 91 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8014842
## 2 col 0.7701817
## 3 eng 0.8103189
## 4 inc 0.8123242
## 5 lea 0.8888052
#change the factor favorable scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 80 Alignment
## 1 ali_1 91 Alignment
## 2 ali_2 82 Alignment
## 3 ali_3 67 Alignment
## 22 col 77 Collaboration
## 4 col_1 85 Collaboration
## 5 col_2 62 Collaboration
## 6 col_3 84 Collaboration
## 23 eng 81 Engagement
## 7 eng_1 94 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 83 Engagement
## 10 eng_4 69 Engagement
## 11 eng_5 72 Engagement
## 24 inc 81 Inclusion
## 12 inc_1 91 Inclusion
## 13 inc_2 76 Inclusion
## 14 inc_3 85 Inclusion
## 15 inc_4 77 Inclusion
## 16 inc_5 78 Inclusion
## 25 lea 89 Leadership
## 17 lea_1 88 Leadership
## 18 lea_2 88 Leadership
## 19 lea_3 89 Leadership
## 20 lea_4 91 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 1-2 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 1-2 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "1-2 years",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 1-2 years 0.8103189
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 81 Engagement
## 7 eng_1 94 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 83 Engagement
## 10 eng_4 69 Engagement
## 11 eng_5 72 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 1-2 Years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Male" &
df_2$tenure_group == "2-4 years" &
!is.na(df_2$gender), ]
## # A tibble: 453 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M017… NA NA NA NA NA NA 4 NA NA NA NA NA
## 2 M004… 4 5 5 4 5 5 5 3 4 4 4 4
## 3 M014… 4 4 2 3 2 3 4 4 3 2 3 5
## 4 M009… 4 5 4 4 5 5 4 1 1 3 4 5
## 5 M020… 5 5 5 5 4 5 5 5 5 5 5 5
## 6 M001… 5 5 4 3 3 4 5 5 4 2 3 5
## 7 M014… 5 5 1 5 3 2 5 5 5 4 4 5
## 8 M011… 4 5 4 4 2 1 5 4 4 4 4 4
## 9 M012… 5 4 4 5 3 5 5 5 4 5 5 5
## 10 M013… 4 4 5 4 3 4 4 5 4 4 4 4
## # ℹ 443 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Male" &
df_2$tenure_group == "2-4 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8953229
## 2 ali_2 0.8459821
## 3 ali_3 0.6674107
## 4 col_1 0.8177778
## 5 col_2 0.5915179
## 6 col_3 0.7438753
## 7 eng_1 0.9247788
## 8 eng_2 0.8716814
## 9 eng_3 0.7977778
## 10 eng_4 0.6615044
## 11 eng_5 0.7323009
## 12 inc_1 0.9159292
## 13 inc_2 0.7652370
## 14 inc_3 0.8224719
## 15 inc_4 0.7042889
## 16 inc_5 0.7645740
## 17 lea_1 0.8466667
## 18 lea_2 0.8530067
## 19 lea_3 0.8824834
## 20 lea_4 0.8933333
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 90 Alignment
## 2 ali_2 85 Alignment
## 3 ali_3 67 Alignment
## 4 col_1 82 Collaboration
## 5 col_2 59 Collaboration
## 6 col_3 74 Collaboration
## 7 eng_1 92 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 80 Engagement
## 10 eng_4 66 Engagement
## 11 eng_5 73 Engagement
## 12 inc_1 92 Inclusion
## 13 inc_2 77 Inclusion
## 14 inc_3 82 Inclusion
## 15 inc_4 70 Inclusion
## 16 inc_5 76 Inclusion
## 17 lea_1 85 Leadership
## 18 lea_2 85 Leadership
## 19 lea_3 88 Leadership
## 20 lea_4 89 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8029053
## 2 col 0.7177236
## 3 eng 0.7976087
## 4 inc 0.7945002
## 5 lea 0.8688725
#change the favorability scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 80 Alignment
## 1 ali_1 90 Alignment
## 2 ali_2 85 Alignment
## 3 ali_3 67 Alignment
## 22 col 72 Collaboration
## 4 col_1 82 Collaboration
## 5 col_2 59 Collaboration
## 6 col_3 74 Collaboration
## 23 eng 80 Engagement
## 7 eng_1 92 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 80 Engagement
## 10 eng_4 66 Engagement
## 11 eng_5 73 Engagement
## 24 inc 79 Inclusion
## 12 inc_1 92 Inclusion
## 13 inc_2 77 Inclusion
## 14 inc_3 82 Inclusion
## 15 inc_4 70 Inclusion
## 16 inc_5 76 Inclusion
## 25 lea 87 Leadership
## 17 lea_1 85 Leadership
## 18 lea_2 85 Leadership
## 19 lea_3 88 Leadership
## 20 lea_4 89 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 2-4 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 2-4 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "2-4 years",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 2-4 years 0.7976087
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 80 Engagement
## 7 eng_1 92 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 80 Engagement
## 10 eng_4 66 Engagement
## 11 eng_5 73 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 2-4 Years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Male" &
df_2$tenure_group == "4-6 years" &
!is.na(df_2$gender), ]
## # A tibble: 216 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M008… 5 5 5 5 5 5 5 5 5 5 5 5
## 2 M001… 5 5 4 5 5 5 5 5 5 4 3 5
## 3 M002… 5 5 4 5 4 4 5 5 5 5 5 5
## 4 M018… 4 4 4 2 5 5 4 1 3 4 3 4
## 5 M013… 5 5 4 4 NA 5 5 5 5 4 5 5
## 6 M006… 5 5 2 5 3 4 5 5 5 4 2 4
## 7 M009… 5 5 3 5 5 5 5 5 5 5 5 5
## 8 M005… 4 4 4 4 3 3 4 4 4 4 4 4
## 9 M023… 5 5 5 4 5 5 5 4 5 3 3 5
## 10 M007… 5 5 3 5 4 5 5 5 5 5 5 5
## # ℹ 206 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Male" &
df_2$tenure_group == "4-6 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.8930233
## 2 ali_2 0.8262911
## 3 ali_3 0.7289720
## 4 col_1 0.8465116
## 5 col_2 0.6525822
## 6 col_3 0.7428571
## 7 eng_1 0.9120370
## 8 eng_2 0.8697674
## 9 eng_3 0.8000000
## 10 eng_4 0.6744186
## 11 eng_5 0.6481481
## 12 inc_1 0.9052133
## 13 inc_2 0.7572816
## 14 inc_3 0.8564593
## 15 inc_4 0.7439614
## 16 inc_5 0.7403846
## 17 lea_1 0.7943925
## 18 lea_2 0.8651163
## 19 lea_3 0.8925234
## 20 lea_4 0.8738318
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 89 Alignment
## 2 ali_2 83 Alignment
## 3 ali_3 73 Alignment
## 4 col_1 85 Collaboration
## 5 col_2 65 Collaboration
## 6 col_3 74 Collaboration
## 7 eng_1 91 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 80 Engagement
## 10 eng_4 67 Engagement
## 11 eng_5 65 Engagement
## 12 inc_1 91 Inclusion
## 13 inc_2 76 Inclusion
## 14 inc_3 86 Inclusion
## 15 inc_4 74 Inclusion
## 16 inc_5 74 Inclusion
## 17 lea_1 79 Leadership
## 18 lea_2 87 Leadership
## 19 lea_3 89 Leadership
## 20 lea_4 87 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8160954
## 2 col 0.7473170
## 3 eng 0.7808742
## 4 inc 0.8006600
## 5 lea 0.8564660
#change the favorability scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 82 Alignment
## 1 ali_1 89 Alignment
## 2 ali_2 83 Alignment
## 3 ali_3 73 Alignment
## 22 col 75 Collaboration
## 4 col_1 85 Collaboration
## 5 col_2 65 Collaboration
## 6 col_3 74 Collaboration
## 23 eng 78 Engagement
## 7 eng_1 91 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 80 Engagement
## 10 eng_4 67 Engagement
## 11 eng_5 65 Engagement
## 24 inc 80 Inclusion
## 12 inc_1 91 Inclusion
## 13 inc_2 76 Inclusion
## 14 inc_3 86 Inclusion
## 15 inc_4 74 Inclusion
## 16 inc_5 74 Inclusion
## 25 lea 86 Leadership
## 17 lea_1 79 Leadership
## 18 lea_2 87 Leadership
## 19 lea_3 89 Leadership
## 20 lea_4 87 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 4-6 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 4-6 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "4-6 years",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 4-6 years 0.7808742
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 78 Engagement
## 7 eng_1 91 Engagement
## 8 eng_2 87 Engagement
## 9 eng_3 80 Engagement
## 10 eng_4 67 Engagement
## 11 eng_5 65 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 4-6 Years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
table(df_2$gender, df_2$tenure_group)
##
## 1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
## Female 211 1 191 116 58 37
## Male 459 3 453 209 216 126
##
## 6-12 months Under 3 months
## Female 181 25
## Male 325 39
df_2[
df_2$gender == "Male" &
df_2$tenure_group == "6-10 years" &
!is.na(df_2$gender), ]
## # A tibble: 126 × 29
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M017… 4 4 4 4 4 3 4 4 4 4 4 4
## 2 M021… 5 5 4 5 4 5 5 5 4 5 5 5
## 3 M025… 4 4 4 4 4 4 5 5 3 3 4 4
## 4 M022… 3 3 4 5 3 3 4 3 3 2 2 4
## 5 M016… 5 5 4 5 5 5 5 5 5 5 5 5
## 6 M016… 4 4 3 4 3 4 4 4 3 3 3 4
## 7 M011… 4 NA NA 4 4 NA 4 4 4 4 3 4
## 8 M011… 5 5 5 4 3 3 4 5 4 4 4 4
## 9 M023… 4 4 4 4 4 4 4 5 3 2 4 4
## 10 M014… NA NA NA 2 2 1 4 4 3 2 3 2
## # ℹ 116 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
df_2$gender == "Male" &
df_2$tenure_group == "6-10 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 ali_1 0.9354839
## 2 ali_2 0.8943089
## 3 ali_3 0.7723577
## 4 col_1 0.8492063
## 5 col_2 0.7200000
## 6 col_3 0.7680000
## 7 eng_1 0.9365079
## 8 eng_2 0.8888889
## 9 eng_3 0.7698413
## 10 eng_4 0.6904762
## 11 eng_5 0.7200000
## 12 inc_1 0.8861789
## 13 inc_2 0.7166667
## 14 inc_3 0.7786885
## 15 inc_4 0.6290323
## 16 inc_5 0.7704918
## 17 lea_1 0.8080000
## 18 lea_2 0.8548387
## 19 lea_3 0.8480000
## 20 lea_4 0.8560000
favorability_score_percent <- favorability_score
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
TRUE ~ "Other")
print(favorability_score_percent)
## question_number favorability_score Factor
## 1 ali_1 94 Alignment
## 2 ali_2 89 Alignment
## 3 ali_3 77 Alignment
## 4 col_1 85 Collaboration
## 5 col_2 72 Collaboration
## 6 col_3 77 Collaboration
## 7 eng_1 94 Engagement
## 8 eng_2 89 Engagement
## 9 eng_3 77 Engagement
## 10 eng_4 69 Engagement
## 11 eng_5 72 Engagement
## 12 inc_1 89 Inclusion
## 13 inc_2 72 Inclusion
## 14 inc_3 78 Inclusion
## 15 inc_4 63 Inclusion
## 16 inc_5 77 Inclusion
## 17 lea_1 81 Leadership
## 18 lea_2 85 Leadership
## 19 lea_3 85 Leadership
## 20 lea_4 86 Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(factor_favorable_score,
data.frame(factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score))
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score
## 1 ali 0.8673835
## 2 col 0.7790688
## 3 eng 0.8011429
## 4 inc 0.7562116
## 5 lea 0.8417097
#change the favorability scores into percentage
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100
library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
TRUE ~ "Other")
#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Factor
## 21 ali 87 Alignment
## 1 ali_1 94 Alignment
## 2 ali_2 89 Alignment
## 3 ali_3 77 Alignment
## 22 col 78 Collaboration
## 4 col_1 85 Collaboration
## 5 col_2 72 Collaboration
## 6 col_3 77 Collaboration
## 23 eng 80 Engagement
## 7 eng_1 94 Engagement
## 8 eng_2 89 Engagement
## 9 eng_3 77 Engagement
## 10 eng_4 69 Engagement
## 11 eng_5 72 Engagement
## 24 inc 76 Inclusion
## 12 inc_1 89 Inclusion
## 13 inc_2 72 Inclusion
## 14 inc_3 78 Inclusion
## 15 inc_4 63 Inclusion
## 16 inc_5 77 Inclusion
## 25 lea 84 Leadership
## 17 lea_1 81 Leadership
## 18 lea_2 85 Leadership
## 19 lea_3 85 Leadership
## 20 lea_4 86 Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-10 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
## Create a ggplot bar plot for factor favorable scores
library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
geom_bar(stat = "identity", show.legend = TRUE) +
labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-10 years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) + # Add text labels to the bars
scale_fill_manual(values = group_colors) # Apply custom colors to the bars
#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
question_number = "6-10 years",
favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])
engagement_by_tenure
## question_number favorability_score
## 1 6-10 years 0.8011429
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]
engagement_favorable_score_for_all
## question_number favorability_score Factor
## 23 eng 80 Engagement
## 7 eng_1 94 Engagement
## 8 eng_2 89 Engagement
## 9 eng_3 77 Engagement
## 10 eng_4 69 Engagement
## 11 eng_5 72 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 6-10 Years") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors)
#Create a new variable named 'engagement_factor', representing the average engagement scores.
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)
#Select only the numeric columns from 'df_2' excluding 'eng_1' to 'eng_5'
numeric_df_2 <- Filter(is.numeric, df_2)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "engagement_factor",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
library(corrplot)
## corrplot 0.92 loaded
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")
# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]
engagement_factor_correlation_table <- data.frame(
variable = names(engagement_correlations),
correlation = engagement_correlations
)
engagement_factor_correlation_table <- engagement_factor_correlation_table[
engagement_factor_correlation_table$variable != "engagement_factor",
]
# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
arrange(correlation)
# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]
# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
engagement_factor_correlation_table$variable,
levels = engagement_factor_correlation_table$variable
)
# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement", caption = "Correlations calculated using Pearson's r"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
#engagement factor's correlations with other factors
library(dplyr)
#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>%
mutate(
Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE), # Alignment score
Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE), # Collaboration score
Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE), # Engagement score
Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE), # Inclusion score
Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE) # Leadership score
) %>%
# Select only the composite factor scores and an identifier (if needed)
select(Alignment, Collaboration, Engagement, Inclusion, Leadership)
# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
## Alignment Collaboration Engagement Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3 5 4.6 4.75
## 2 4 4 3.6 3.8 4.75
## 3 4.67 4.67 4.4 5 5
## 4 5 5 5 4.4 5
## 5 4.67 5 4.4 4.8 5
## 6 3.67 4 3.8 3.4 3.25
## 7 3.67 3 3.4 4 5
## 8 4.67 4.67 4.6 4.6 5
## 9 4.67 4.33 5 4.8 5
## 10 5 4.67 4.4 4.6 4.5
## # ℹ 2,641 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 2,651 × 4
## Alignment Collaboration Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3 4.6 4.75
## 2 4 4 3.8 4.75
## 3 4.67 4.67 5 5
## 4 5 5 4.4 5
## 5 4.67 5 4.8 5
## 6 3.67 4 3.4 3.25
## 7 3.67 3 4 5
## 8 4.67 4.67 4.6 5
## 9 4.67 4.33 4.8 5
## 10 5 4.67 4.6 4.5
## # ℹ 2,641 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 2,651 × 1
## Engagement
## <dbl>
## 1 5
## 2 3.6
## 3 4.4
## 4 5
## 5 4.4
## 6 3.8
## 7 3.4
## 8 4.6
## 9 5
## 10 4.4
## # ℹ 2,641 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "Engagement",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
- b. for women only (correlation calculated by conducting kendall's tau-b)
library(dplyr)
# Subset df_2 to include only female employees
df_2_female <- df_2 %>%
filter(gender == "Female")
# Calculate the engagement factor for female employees
df_2_female$engagement_factor <- rowMeans(df_2_female[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)
numeric_df_2 <- Filter(is.numeric, df_2_female)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "engagement_factor",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Women", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")
# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]
engagement_factor_correlation_table <- data.frame(
variable = names(engagement_correlations),
correlation = engagement_correlations
)
engagement_factor_correlation_table <- engagement_factor_correlation_table[
engagement_factor_correlation_table$variable != "engagement_factor",
]
# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
arrange(correlation)
# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]
# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
engagement_factor_correlation_table$variable,
levels = engagement_factor_correlation_table$variable
)
# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Women", caption = "Correlations calculated using Pearson's r"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
#engagement factor's correlations with other factors
library(dplyr)
#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$gender == "Female", ] %>%
mutate(
Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE), # Alignment score
Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE), # Collaboration score
Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE), # Engagement score
Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE), # Inclusion score
Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE) # Leadership score
) %>%
# Select only the composite factor scores and an identifier (if needed)
select(Alignment, Collaboration, Engagement, Inclusion, Leadership)
# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 821 × 5
## Alignment Collaboration Engagement Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.67 4 3.8 3.4 3.25
## 2 5 4.67 4.4 4.6 4.5
## 3 4.33 4.33 4.8 4.6 5
## 4 4.33 4.33 5 4.4 4.25
## 5 4.33 4 3.8 4.6 4.75
## 6 4.33 4 4.2 3.4 4
## 7 5 5 5 4.6 5
## 8 2 3.33 2.6 2.8 4
## 9 4.67 4.33 3.6 5 4.25
## 10 4 4 4.4 4.4 4
## # ℹ 811 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 821 × 4
## Alignment Collaboration Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl>
## 1 3.67 4 3.4 3.25
## 2 5 4.67 4.6 4.5
## 3 4.33 4.33 4.6 5
## 4 4.33 4.33 4.4 4.25
## 5 4.33 4 4.6 4.75
## 6 4.33 4 3.4 4
## 7 5 5 4.6 5
## 8 2 3.33 2.8 4
## 9 4.67 4.33 5 4.25
## 10 4 4 4.4 4
## # ℹ 811 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 821 × 1
## Engagement
## <dbl>
## 1 3.8
## 2 4.4
## 3 4.8
## 4 5
## 5 3.8
## 6 4.2
## 7 5
## 8 2.6
## 9 3.6
## 10 4.4
## # ℹ 811 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "Engagement",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Women", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
c. for male employees (correlation calculated by conducting kendall’s
tau-b)
library(dplyr)
# Subset df_2 to include only male employees
df_2_male <- df_2 %>%
filter(gender == "Male")
# Calculate the engagement factor for female employees
df_2_male$engagement_factor <- rowMeans(df_2_male[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)
# Display a few rows to check the new variable
head(df_2_male)
## # A tibble: 6 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01434 5 5 4 5 3 1 5 5 5 5 5 5
## 2 M00631 4 4 4 4 4 4 4 4 4 3 3 4
## 3 M00325 5 5 4 5 4 5 5 5 5 4 3 5
## 4 M00805 5 5 5 5 5 5 5 5 5 5 5 5
## 5 M00157 5 5 4 5 5 5 5 5 5 4 3 5
## 6 M01220 4 3 4 3 3 3 4 3 4 3 3 4
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
numeric_df_2 <- Filter(is.numeric, df_2_male)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "engagement_factor",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Men", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")
# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]
engagement_factor_correlation_table <- data.frame(
variable = names(engagement_correlations),
correlation = engagement_correlations
)
engagement_factor_correlation_table <- engagement_factor_correlation_table[
engagement_factor_correlation_table$variable != "engagement_factor",
]
# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
arrange(correlation)
# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]
# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
engagement_factor_correlation_table$variable,
levels = engagement_factor_correlation_table$variable
)
# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Men", caption = "Correlations calculated using Pearson's r"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
#engagement factor's correlations with other factors
library(dplyr)
#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$gender == "Male", ] %>%
mutate(
Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE), # Alignment score
Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE), # Collaboration score
Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE), # Engagement score
Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE), # Inclusion score
Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE) # Leadership score
) %>%
# Select only the composite factor scores and an identifier (if needed)
select(Alignment, Collaboration, Engagement, Inclusion, Leadership)
# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 1,831 × 5
## Alignment Collaboration Engagement Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3 5 4.6 4.75
## 2 4 4 3.6 3.8 4.75
## 3 4.67 4.67 4.4 5 5
## 4 5 5 5 4.4 5
## 5 4.67 5 4.4 4.8 5
## 6 3.67 3 3.4 4 5
## 7 4.67 4.67 4.6 4.6 5
## 8 4.67 4.33 5 4.8 5
## 9 4.67 5 4 5 5
## 10 4 3.67 4 3.6 4
## # ℹ 1,821 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 1,831 × 4
## Alignment Collaboration Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3 4.6 4.75
## 2 4 4 3.8 4.75
## 3 4.67 4.67 5 5
## 4 5 5 4.4 5
## 5 4.67 5 4.8 5
## 6 3.67 3 4 5
## 7 4.67 4.67 4.6 5
## 8 4.67 4.33 4.8 5
## 9 4.67 5 5 5
## 10 4 3.67 3.6 4
## # ℹ 1,821 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 1,831 × 1
## Engagement
## <dbl>
## 1 5
## 2 3.6
## 3 4.4
## 4 5
## 5 4.4
## 6 3.4
## 7 4.6
## 8 5
## 9 4
## 10 4
## # ℹ 1,821 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "Engagement",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Men", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
d. for Indian employees (correlation calculated by conducting kendall’s
tau-b)
library(dplyr)
# Subset df_2 to include only Indian employees
df_2_india <- df_2 %>%
filter(country == "India")
# Calculate the engagement factor for female employees
df_2_india$engagement_factor <- rowMeans(df_2_india[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)
# Display a few rows to check the new variable
head(df_2_india)
## # A tibble: 6 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01471 5 5 4 5 5 5 5 5 4 3 3 5
## 2 M02064 5 4 3 5 3 4 5 5 3 4 4 5
## 3 M00210 5 5 5 5 5 5 5 5 5 4 4 5
## 4 M02430 5 5 4 3 5 5 5 5 3 2 3 5
## 5 M01775 5 5 5 5 5 5 5 5 5 5 5 5
## 6 M01748 3 4 3 4 3 5 5 5 5 4 5 2
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
numeric_df_2 <- Filter(is.numeric, df_2_india)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "engagement_factor",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Indian Employees", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")
# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]
engagement_factor_correlation_table <- data.frame(
variable = names(engagement_correlations),
correlation = engagement_correlations
)
engagement_factor_correlation_table <- engagement_factor_correlation_table[
engagement_factor_correlation_table$variable != "engagement_factor",
]
# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
arrange(correlation)
# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]
# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
engagement_factor_correlation_table$variable,
levels = engagement_factor_correlation_table$variable
)
# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Indian Employees", caption = "Correlations calculated using Pearson's r"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
#engagement factor's correlations with other factors
library(dplyr)
#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$country == "India", ] %>%
mutate(
Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE), # Alignment score
Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE), # Collaboration score
Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE), # Engagement score
Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE), # Inclusion score
Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE) # Leadership score
) %>%
# Select only the composite factor scores and an identifier (if needed)
select(Alignment, Collaboration, Engagement, Inclusion, Leadership)
# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 288 × 5
## Alignment Collaboration Engagement Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.67 5 4 5 5
## 2 4 4 4.2 5 5
## 3 5 5 4.6 5 5
## 4 4.67 4.33 3.6 5 4.25
## 5 5 5 5 5 5
## 6 3.33 4 4.8 3.6 4
## 7 4.67 5 4.2 5 5
## 8 3 3.67 4.6 4.2 3.5
## 9 4 4 4.8 4 4
## 10 4.67 5 5 5 5
## # ℹ 278 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 288 × 4
## Alignment Collaboration Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl>
## 1 4.67 5 5 5
## 2 4 4 5 5
## 3 5 5 5 5
## 4 4.67 4.33 5 4.25
## 5 5 5 5 5
## 6 3.33 4 3.6 4
## 7 4.67 5 5 5
## 8 3 3.67 4.2 3.5
## 9 4 4 4 4
## 10 4.67 5 5 5
## # ℹ 278 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 288 × 1
## Engagement
## <dbl>
## 1 4
## 2 4.2
## 3 4.6
## 4 3.6
## 5 5
## 6 4.8
## 7 4.2
## 8 4.6
## 9 4.8
## 10 5
## # ℹ 278 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "Engagement",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Indian Employees", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
e. for German employees (correlation calculated by conducting kendall’s
tau-b)
library(dplyr)
# Subset df_2 to include only Indian employees
df_2_germany <- df_2 %>%
filter(country == "Germany")
# Calculate the engagement factor for german employees
df_2_germany$engagement_factor <- rowMeans(df_2_germany[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)
# Display a few rows to check the new variable
head(df_2_germany)
## # A tibble: 6 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M00166 5 5 4 3 3 4 5 5 4 2 3 5
## 2 M01969 4 4 4 5 5 5 5 5 4 3 5 4
## 3 M01232 4 4 3 3 3 3 3 4 4 3 4 3
## 4 M01944 3 4 4 4 2 4 4 4 3 3 3 4
## 5 M00395 4 4 5 4 4 4 5 4 5 4 4 5
## 6 M00532 5 5 3 4 2 2 3 4 4 2 3 4
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
numeric_df_2 <- Filter(is.numeric, df_2_germany)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "engagement_factor",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for German Employees", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")
# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]
engagement_factor_correlation_table <- data.frame(
variable = names(engagement_correlations),
correlation = engagement_correlations
)
engagement_factor_correlation_table <- engagement_factor_correlation_table[
engagement_factor_correlation_table$variable != "engagement_factor",
]
# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
arrange(correlation)
# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]
# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
engagement_factor_correlation_table$variable,
levels = engagement_factor_correlation_table$variable
)
# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for German Employees", caption = "Correlations calculated using Pearson's r"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
#engagement factor's correlations with other factors
library(dplyr)
#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$country == "Germany", ] %>%
mutate(
Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE), # Alignment score
Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE), # Collaboration score
Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE), # Engagement score
Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE), # Inclusion score
Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE) # Leadership score
) %>%
# Select only the composite factor scores and an identifier (if needed)
select(Alignment, Collaboration, Engagement, Inclusion, Leadership)
# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 48 × 5
## Alignment Collaboration Engagement Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3.33 3.8 4 4
## 2 4 5 4.4 3.8 5
## 3 3.67 3 3.6 2.8 3.5
## 4 3.67 3.33 3.4 3.6 4
## 5 4.33 4 4.4 5 4.75
## 6 4.33 2.67 3.2 3.4 2.75
## 7 4 3.67 4.4 4.8 4.25
## 8 4 3.33 4 5 5
## 9 5 4.33 4.6 5 5
## 10 3.67 3 3 5 4
## # ℹ 38 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 48 × 4
## Alignment Collaboration Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3.33 4 4
## 2 4 5 3.8 5
## 3 3.67 3 2.8 3.5
## 4 3.67 3.33 3.6 4
## 5 4.33 4 5 4.75
## 6 4.33 2.67 3.4 2.75
## 7 4 3.67 4.8 4.25
## 8 4 3.33 5 5
## 9 5 4.33 5 5
## 10 3.67 3 5 4
## # ℹ 38 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 48 × 1
## Engagement
## <dbl>
## 1 3.8
## 2 4.4
## 3 3.6
## 4 3.4
## 5 4.4
## 6 3.2
## 7 4.4
## 8 4
## 9 4.6
## 10 3
## # ℹ 38 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "Engagement",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for German Employees", caption = "Correlations calculated using Kendall's tau-b"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
f. for employees who are high (engagement factor composite score > or
= 4) on employee_engagement (correlation calculated by conducting
kendall’s tau-b)
library(dplyr)
# Calculate the engagement factor for employees
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)
hist(df_2$engagement_factor)
high_engagement <- df_2 %>%
filter(engagement_factor >= 4)
numeric_df_2 <- Filter(is.numeric, high_engagement)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "engagement_factor",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for highly engaging employees", caption = "Correlations calculated using Kendall's tau-b | employees with engagement composite factor score greater than or equal to 4 are only included for analysis"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")
# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]
engagement_factor_correlation_table <- data.frame(
variable = names(engagement_correlations),
correlation = engagement_correlations
)
engagement_factor_correlation_table <- engagement_factor_correlation_table[
engagement_factor_correlation_table$variable != "engagement_factor",
]
# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
arrange(correlation)
# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]
# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
engagement_factor_correlation_table$variable,
levels = engagement_factor_correlation_table$variable
)
# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Highly Engaged Employees", caption = "Correlations calculated using Pearson's r | employees with engagement composite factor score greater than or equal to 4 are only included for analysis"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
#engagement factor's correlations with other factors
library(dplyr)
#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>% mutate(
Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE), # Alignment score
Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE), # Collaboration score
Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE), # Engagement score
Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE), # Inclusion score
Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE) # Leadership score
) %>%
# Select only the composite factor scores and an identifier (if needed)
select(Alignment, Collaboration, Engagement, Inclusion, Leadership)
# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
## Alignment Collaboration Engagement Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3 5 4.6 4.75
## 2 4 4 3.6 3.8 4.75
## 3 4.67 4.67 4.4 5 5
## 4 5 5 5 4.4 5
## 5 4.67 5 4.4 4.8 5
## 6 3.67 4 3.8 3.4 3.25
## 7 3.67 3 3.4 4 5
## 8 4.67 4.67 4.6 4.6 5
## 9 4.67 4.33 5 4.8 5
## 10 5 4.67 4.4 4.6 4.5
## # ℹ 2,641 more rows
composite_scores_high <- composite_scores %>%
filter(Engagement >= 4)
composite_factor_scores <- subset(composite_scores_high, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 1,819 × 4
## Alignment Collaboration Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3 4.6 4.75
## 2 4.67 4.67 5 5
## 3 5 5 4.4 5
## 4 4.67 5 4.8 5
## 5 4.67 4.67 4.6 5
## 6 4.67 4.33 4.8 5
## 7 5 4.67 4.6 4.5
## 8 4.67 5 5 5
## 9 4 3.67 3.6 4
## 10 NaN NaN NaN NaN
## # ℹ 1,809 more rows
engagement_factor_scores <- subset(composite_scores_high, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 1,819 × 1
## Engagement
## <dbl>
## 1 5
## 2 4.4
## 3 5
## 4 4.4
## 5 4.6
## 6 5
## 7 4.4
## 8 4
## 9 4
## 10 4
## # ℹ 1,809 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "Engagement",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Highly Engaged Employees", caption = "Correlations calculated using Kendall's tau-b | employees with engagement composite factor score greater than or equal to 4 are only included for analysis"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
library(dplyr)
# Calculate the engagement factor for employees
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)
hist(df_2$engagement_factor)
low_engagement <- df_2 %>%
filter(engagement_factor <= 2)
numeric_df_2 <- Filter(is.numeric, low_engagement)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "engagement_factor",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Less Engaged employees", caption = "Correlations calculated using Kendall's tau-b | Only employees with engagement composite factor score lower than or equal to 2 are included for this analysis"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")
# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]
engagement_factor_correlation_table <- data.frame(
variable = names(engagement_correlations),
correlation = engagement_correlations
)
engagement_factor_correlation_table <- engagement_factor_correlation_table[
engagement_factor_correlation_table$variable != "engagement_factor",
]
# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
arrange(correlation)
# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]
# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
engagement_factor_correlation_table$variable,
levels = engagement_factor_correlation_table$variable
)
# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Less Engaged Employees", caption = "Correlations calculated using Pearson's r | Only employees with engagement composite factor score lower than or equal to 2 are included for this analysis"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
#engagement factor's correlations with other factors
library(dplyr)
#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>% mutate(
Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE), # Alignment score
Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE), # Collaboration score
Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE), # Engagement score
Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE), # Inclusion score
Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE) # Leadership score
) %>%
# Select only the composite factor scores and an identifier (if needed)
select(Alignment, Collaboration, Engagement, Inclusion, Leadership)
# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
## Alignment Collaboration Engagement Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.67 3 5 4.6 4.75
## 2 4 4 3.6 3.8 4.75
## 3 4.67 4.67 4.4 5 5
## 4 5 5 5 4.4 5
## 5 4.67 5 4.4 4.8 5
## 6 3.67 4 3.8 3.4 3.25
## 7 3.67 3 3.4 4 5
## 8 4.67 4.67 4.6 4.6 5
## 9 4.67 4.33 5 4.8 5
## 10 5 4.67 4.4 4.6 4.5
## # ℹ 2,641 more rows
composite_scores_low <- composite_scores %>%
filter(Engagement <= 2)
composite_factor_scores <- subset(composite_scores_low, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 47 × 4
## Alignment Collaboration Inclusion Leadership
## <dbl> <dbl> <dbl> <dbl>
## 1 3.33 4 4.2 1.75
## 2 2 1 2.2 3.25
## 3 2.33 3.33 2 2
## 4 3 4 3.6 3
## 5 1.67 4 2.8 2.5
## 6 2.33 2 3 1
## 7 2.33 3 3.8 3
## 8 1.67 3.67 2.6 4.5
## 9 3 2.67 3 2.5
## 10 1.67 2.67 2.8 2.75
## # ℹ 37 more rows
engagement_factor_scores <- subset(composite_scores_low, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 47 × 1
## Engagement
## <dbl>
## 1 1.8
## 2 1.2
## 3 1.8
## 4 2
## 5 2
## 6 1.4
## 7 2
## 8 1.8
## 9 2
## 10 1.8
## # ℹ 37 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")
#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]
engagement_factor_correlation_table_kendall <- data.frame(
variable = names(engagement_correlations_kendall),
correlation = engagement_correlations_kendall
)
#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
engagement_factor_correlation_table_kendall$variable != "Engagement",
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
arrange(correlation)
#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]
# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
engagement_factor_correlation_table_kendall$variable,
levels = engagement_factor_correlation_table_kendall$variable
)
# Use scales package to format correlations as percentages
library(scales)
# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black") +
labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Less Engaged Employees", caption = "Correlations calculated using Kendall's tau-b | Only employees with engagement composite factor score lower than or equal to 2 are included for analysis"
) +
theme_minimal() +
theme(
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.caption = element_text(size = 10, hjust = 0)
)
df_2[df_2$country == "Australia" & !is.na(df_2$country), ]
## # A tibble: 101 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M017… 4 4 4 4 4 3 4 4 4 4 4 4
## 2 M017… 4 4 3 4 4 4 4 4 5 4 4 4
## 3 M012… 4 3 4 3 4 4 4 4 3 3 2 4
## 4 M006… 5 5 5 4 5 3 5 5 5 5 5 5
## 5 M024… 5 5 3 5 5 2 5 5 4 5 4 4
## 6 M025… 5 5 5 5 4 5 4 5 4 5 5 5
## 7 M002… 5 3 4 4 3 1 5 4 4 2 3 5
## 8 M000… 5 5 5 5 5 5 5 5 5 5 5 5
## 9 M012… 4 4 4 4 4 3 4 4 4 3 4 4
## 10 M023… 4 5 4 4 3 2 4 5 4 4 3 4
## # ℹ 91 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Australia", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9207921
## 2 eng_2 0.9306931
## 3 eng_3 0.8514851
## 4 eng_4 0.6930693
## 5 eng_5 0.7425743
## 6 lea_1 0.8613861
## 7 lea_2 0.9207921
## 8 lea_3 0.9603960
## 9 lea_4 0.9603960
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "Australia" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.8277228 Australia
## 2 lea 0.9257426 Australia
favorability_score$Country <- "Australia"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.8277228 Australia
## 1 eng_1 0.9207921 Australia
## 2 eng_2 0.9306931 Australia
## 3 eng_3 0.8514851 Australia
## 4 eng_4 0.6930693 Australia
## 5 eng_5 0.7425743 Australia
## 11 lea 0.9257426 Australia
## 6 lea_1 0.8613861 Australia
## 7 lea_2 0.9207921 Australia
## 8 lea_3 0.9603960 Australia
## 9 lea_4 0.9603960 Australia
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.8277228 Australia
## 1 eng_1 0.9207921 Australia
## 2 eng_2 0.9306931 Australia
## 3 eng_3 0.8514851 Australia
## 4 eng_4 0.6930693 Australia
## 5 eng_5 0.7425743 Australia
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for Aussie employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Aussie Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- b. Canada’s N = 84
df_2[df_2$country == "Canada" & !is.na(df_2$country), ]
## # A tibble: 84 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M002… 5 5 4 5 4 4 5 5 5 5 5 5
## 2 M001… 5 5 3 5 3 5 5 5 5 4 5 5
## 3 M022… 5 5 5 5 4 3 5 5 5 5 5 5
## 4 M023… 5 5 5 5 5 5 4 5 5 4 5 4
## 5 M009… 4 4 4 5 5 5 4 4 4 4 4 5
## 6 M013… 5 5 4 5 5 5 5 5 5 4 5 5
## 7 M015… 4 4 4 4 4 4 4 4 4 4 4 4
## 8 M006… 5 5 4 4 NA 2 5 5 5 5 5 5
## 9 M012… 5 5 4 4 4 5 5 5 5 3 3 5
## 10 M019… 4 4 4 4 3 3 4 4 4 4 4 4
## # ℹ 74 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Canada", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9285714
## 2 eng_2 0.8333333
## 3 eng_3 0.8690476
## 4 eng_4 0.7619048
## 5 eng_5 0.7976190
## 6 lea_1 0.8452381
## 7 lea_2 0.8333333
## 8 lea_3 0.9404762
## 9 lea_4 0.8313253
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "Canada" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.8380952 Canada
## 2 lea 0.8625932 Canada
favorability_score$Country <- "Canada"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.8380952 Canada
## 1 eng_1 0.9285714 Canada
## 2 eng_2 0.8333333 Canada
## 3 eng_3 0.8690476 Canada
## 4 eng_4 0.7619048 Canada
## 5 eng_5 0.7976190 Canada
## 11 lea 0.8625932 Canada
## 6 lea_1 0.8452381 Canada
## 7 lea_2 0.8333333 Canada
## 8 lea_3 0.9404762 Canada
## 9 lea_4 0.8313253 Canada
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.8380952 Canada
## 1 eng_1 0.9285714 Canada
## 2 eng_2 0.8333333 Canada
## 3 eng_3 0.8690476 Canada
## 4 eng_4 0.7619048 Canada
## 5 eng_5 0.7976190 Canada
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for Danish employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Canadian Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- c. China’s N = 60
df_2[df_2$country == "China" & !is.na(df_2$country), ]
## # A tibble: 60 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M000… 5 4 4 3 5 3 5 5 5 5 5 5
## 2 M012… 5 5 4 5 4 4 4 4 4 5 5 4
## 3 M019… 4 4 4 4 4 4 4 4 4 4 5 4
## 4 M012… 4 2 2 4 5 3 4 3 4 3 3 5
## 5 M021… 4 2 3 3 3 1 3 3 3 2 3 3
## 6 M001… 4 4 4 2 3 3 4 3 5 4 4 4
## 7 M020… 4 5 4 5 4 4 5 5 5 5 5 5
## 8 M014… 4 4 4 5 3 3 5 5 3 4 5 5
## 9 M022… 5 5 4 5 5 5 5 5 5 5 5 5
## 10 M004… 5 5 5 5 5 5 5 5 5 5 5 5
## # ℹ 50 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "China", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.8666667
## 2 eng_2 0.7833333
## 3 eng_3 0.7833333
## 4 eng_4 0.7833333
## 5 eng_5 0.8166667
## 6 lea_1 0.8333333
## 7 lea_2 0.8166667
## 8 lea_3 0.8833333
## 9 lea_4 0.8666667
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "China" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.8066667 China
## 2 lea 0.8500000 China
favorability_score$Country <- "China"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.8066667 China
## 1 eng_1 0.8666667 China
## 2 eng_2 0.7833333 China
## 3 eng_3 0.7833333 China
## 4 eng_4 0.7833333 China
## 5 eng_5 0.8166667 China
## 11 lea 0.8500000 China
## 6 lea_1 0.8333333 China
## 7 lea_2 0.8166667 China
## 8 lea_3 0.8833333 China
## 9 lea_4 0.8666667 China
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.8066667 China
## 1 eng_1 0.8666667 China
## 2 eng_2 0.7833333 China
## 3 eng_3 0.7833333 China
## 4 eng_4 0.7833333 China
## 5 eng_5 0.8166667 China
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for Chinese employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Chinese Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- d. Denmark’s N = 24
df_2[df_2$country == "Denmark" & !is.na(df_2$country), ]
## # A tibble: 24 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M022… 4 2 4 4 2 1 3 3 5 3 3 4
## 2 M008… 5 5 4 5 4 5 5 5 4 5 5 5
## 3 M009… 5 2 3 3 3 1 4 4 3 3 3 5
## 4 M023… 5 5 5 5 3 5 5 5 5 5 5 5
## 5 M016… 5 5 5 5 NA 4 5 5 5 5 5 5
## 6 M009… 4 4 4 4 4 3 4 4 4 4 4 5
## 7 M013… 5 4 3 NA 2 3 5 4 5 4 4 5
## 8 M012… 5 5 4 5 4 4 5 5 4 4 4 5
## 9 M014… 5 NA 3 5 4 2 5 5 5 4 4 5
## 10 M012… 5 4 4 5 3 4 5 4 3 4 4 5
## # ℹ 14 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Denmark", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9583333
## 2 eng_2 0.7916667
## 3 eng_3 0.8333333
## 4 eng_4 0.7083333
## 5 eng_5 0.7500000
## 6 lea_1 0.8333333
## 7 lea_2 0.8750000
## 8 lea_3 0.8695652
## 9 lea_4 0.8695652
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "Denmark" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.8083333 Denmark
## 2 lea 0.8618659 Denmark
favorability_score$Country <- "Denmark"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.8083333 Denmark
## 1 eng_1 0.9583333 Denmark
## 2 eng_2 0.7916667 Denmark
## 3 eng_3 0.8333333 Denmark
## 4 eng_4 0.7083333 Denmark
## 5 eng_5 0.7500000 Denmark
## 11 lea 0.8618659 Denmark
## 6 lea_1 0.8333333 Denmark
## 7 lea_2 0.8750000 Denmark
## 8 lea_3 0.8695652 Denmark
## 9 lea_4 0.8695652 Denmark
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.8083333 Denmark
## 1 eng_1 0.9583333 Denmark
## 2 eng_2 0.7916667 Denmark
## 3 eng_3 0.8333333 Denmark
## 4 eng_4 0.7083333 Denmark
## 5 eng_5 0.7500000 Denmark
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for Danish employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Danish Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- e. France’s N = 136
df_2[df_2$country == "France" & !is.na(df_2$country), ]
## # A tibble: 136 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M014… 5 5 4 5 3 1 5 5 5 5 5 5
## 2 M006… 4 4 4 4 4 4 4 4 4 3 3 4
## 3 M004… 4 4 2 4 3 4 4 4 4 3 3 4
## 4 M011… 4 5 4 4 3 4 5 4 4 4 4 5
## 5 M002… 5 5 4 5 3 3 5 5 5 5 5 5
## 6 M009… 4 3 3 4 3 4 5 5 3 3 4 5
## 7 M024… 5 5 5 5 5 5 5 5 5 5 5 5
## 8 M005… 4 4 4 4 3 3 5 5 5 4 5 4
## 9 M018… 4 5 3 5 5 2 5 5 5 5 5 5
## 10 M006… 4 3 4 4 2 1 4 4 1 4 5 5
## # ℹ 126 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "France", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9338235
## 2 eng_2 0.8602941
## 3 eng_3 0.8750000
## 4 eng_4 0.7279412
## 5 eng_5 0.7720588
## 6 lea_1 0.9264706
## 7 lea_2 0.9044118
## 8 lea_3 0.9411765
## 9 lea_4 0.9485294
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "France" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.8338235 France
## 2 lea 0.9301471 France
favorability_score$Country <- "France"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.8338235 France
## 1 eng_1 0.9338235 France
## 2 eng_2 0.8602941 France
## 3 eng_3 0.8750000 France
## 4 eng_4 0.7279412 France
## 5 eng_5 0.7720588 France
## 11 lea 0.9301471 France
## 6 lea_1 0.9264706 France
## 7 lea_2 0.9044118 France
## 8 lea_3 0.9411765 France
## 9 lea_4 0.9485294 France
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.8338235 France
## 1 eng_1 0.9338235 France
## 2 eng_2 0.8602941 France
## 3 eng_3 0.8750000 France
## 4 eng_4 0.7279412 France
## 5 eng_5 0.7720588 France
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for French employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for French Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- f. Germany’s N = 48
df_2[df_2$country == "Germany" & !is.na(df_2$country), ]
## # A tibble: 48 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M001… 5 5 4 3 3 4 5 5 4 2 3 5
## 2 M019… 4 4 4 5 5 5 5 5 4 3 5 4
## 3 M012… 4 4 3 3 3 3 3 4 4 3 4 3
## 4 M019… 3 4 4 4 2 4 4 4 3 3 3 4
## 5 M003… 4 4 5 4 4 4 5 4 5 4 4 5
## 6 M005… 5 5 3 4 2 2 3 4 4 2 3 4
## 7 M009… 4 4 4 5 3 3 4 5 4 5 4 5
## 8 M008… 5 4 3 4 3 3 5 5 4 2 4 5
## 9 M017… 5 5 5 5 3 5 5 5 5 4 4 5
## 10 M016… 5 3 3 3 4 2 4 3 3 2 3 5
## # ℹ 38 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Germany", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.8333333
## 2 eng_2 0.8333333
## 3 eng_3 0.7083333
## 4 eng_4 0.5106383
## 5 eng_5 0.5957447
## 6 lea_1 0.8750000
## 7 lea_2 0.7916667
## 8 lea_3 0.9375000
## 9 lea_4 0.8541667
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "Germany" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.6962766 Germany
## 2 lea 0.8645833 Germany
favorability_score$Country <- "Germany"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.6962766 Germany
## 1 eng_1 0.8333333 Germany
## 2 eng_2 0.8333333 Germany
## 3 eng_3 0.7083333 Germany
## 4 eng_4 0.5106383 Germany
## 5 eng_5 0.5957447 Germany
## 11 lea 0.8645833 Germany
## 6 lea_1 0.8750000 Germany
## 7 lea_2 0.7916667 Germany
## 8 lea_3 0.9375000 Germany
## 9 lea_4 0.8541667 Germany
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.6962766 Germany
## 1 eng_1 0.8333333 Germany
## 2 eng_2 0.8333333 Germany
## 3 eng_3 0.7083333 Germany
## 4 eng_4 0.5106383 Germany
## 5 eng_5 0.5957447 Germany
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for German employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for German Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- g. India's N = 288
df_2[df_2$country == "India" & !is.na(df_2$country), ]
## # A tibble: 288 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M014… 5 5 4 5 5 5 5 5 4 3 3 5
## 2 M020… 5 4 3 5 3 4 5 5 3 4 4 5
## 3 M002… 5 5 5 5 5 5 5 5 5 4 4 5
## 4 M024… 5 5 4 3 5 5 5 5 3 2 3 5
## 5 M017… 5 5 5 5 5 5 5 5 5 5 5 5
## 6 M017… 3 4 3 4 3 5 5 5 5 4 5 2
## 7 M016… 5 5 4 5 5 5 5 5 5 3 3 5
## 8 M006… 3 3 3 3 4 4 5 5 3 5 5 5
## 9 M026… 4 4 4 4 4 4 5 5 5 5 4 4
## 10 M023… 5 5 4 5 5 5 5 5 5 5 5 5
## # ℹ 278 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "India", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9442509
## 2 eng_2 0.9166667
## 3 eng_3 0.8767606
## 4 eng_4 0.7152778
## 5 eng_5 0.7673611
## 6 lea_1 0.8982456
## 7 lea_2 0.9125874
## 8 lea_3 0.9581882
## 9 lea_4 0.9542254
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "India" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.8440634 India
## 2 lea 0.9308116 India
favorability_score$Country <- "India"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.8440634 India
## 1 eng_1 0.9442509 India
## 2 eng_2 0.9166667 India
## 3 eng_3 0.8767606 India
## 4 eng_4 0.7152778 India
## 5 eng_5 0.7673611 India
## 11 lea 0.9308116 India
## 6 lea_1 0.8982456 India
## 7 lea_2 0.9125874 India
## 8 lea_3 0.9581882 India
## 9 lea_4 0.9542254 India
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.8440634 India
## 1 eng_1 0.9442509 India
## 2 eng_2 0.9166667 India
## 3 eng_3 0.8767606 India
## 4 eng_4 0.7152778 India
## 5 eng_5 0.7673611 India
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for Indian employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Indian Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- h. United Kingdom's N = 359
df_2[df_2$country == "United Kingdom" & !is.na(df_2$country), ]
## # A tibble: 359 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M003… 5 5 4 5 4 5 5 5 5 4 3 5
## 2 M014… 4 4 2 3 2 3 4 4 3 2 3 5
## 3 M020… 5 5 5 5 4 5 5 5 5 5 5 5
## 4 M023… 5 5 5 4 5 5 5 4 5 3 3 5
## 5 M019… 5 5 5 5 5 5 5 5 5 5 5 5
## 6 M010… 4 4 4 4 3 4 5 5 4 4 4 4
## 7 M000… 4 4 4 2 3 4 4 4 3 3 4 3
## 8 M013… 5 4 2 4 NA 4 5 5 4 4 4 5
## 9 M018… 3 3 2 2 1 3 4 3 3 2 3 3
## 10 M016… 5 5 4 5 3 5 5 5 5 4 5 5
## # ℹ 349 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "United Kingdom", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9385475
## 2 eng_2 0.8770950
## 3 eng_3 0.8328691
## 4 eng_4 0.6796657
## 5 eng_5 0.7214485
## 6 lea_1 0.8743017
## 7 lea_2 0.8659218
## 8 lea_3 0.9220056
## 9 lea_4 0.9359331
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "United Kingdom" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.8099251 United Kingdom
## 2 lea 0.8995405 United Kingdom
favorability_score$Country <- "United Kingdom"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.8099251 United Kingdom
## 1 eng_1 0.9385475 United Kingdom
## 2 eng_2 0.8770950 United Kingdom
## 3 eng_3 0.8328691 United Kingdom
## 4 eng_4 0.6796657 United Kingdom
## 5 eng_5 0.7214485 United Kingdom
## 11 lea 0.8995405 United Kingdom
## 6 lea_1 0.8743017 United Kingdom
## 7 lea_2 0.8659218 United Kingdom
## 8 lea_3 0.9220056 United Kingdom
## 9 lea_4 0.9359331 United Kingdom
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.8099251 United Kingdom
## 1 eng_1 0.9385475 United Kingdom
## 2 eng_2 0.8770950 United Kingdom
## 3 eng_3 0.8328691 United Kingdom
## 4 eng_4 0.6796657 United Kingdom
## 5 eng_5 0.7214485 United Kingdom
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for U.S. employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for U.K. Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- i. United States' N = 1551
df_2[df_2$country == "United States" & !is.na(df_2$country), ]
## # A tibble: 1,551 × 30
## eeid ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M008… 5 5 5 5 5 5 5 5 5 5 5 5
## 2 M001… 5 5 4 5 5 5 5 5 5 4 3 5
## 3 M017… 4 4 3 4 4 4 4 4 3 4 4 4
## 4 M012… 4 3 4 3 3 3 4 3 4 3 3 4
## 5 M025… 5 5 4 4 5 5 5 4 5 4 5 5
## 6 M010… 5 5 5 5 4 5 5 5 4 5 3 5
## 7 M017… NA NA NA NA NA NA 4 NA NA NA NA NA
## 8 M018… 4 4 4 2 5 5 4 1 3 4 3 4
## 9 M013… 4 5 4 5 4 5 5 5 5 4 5 5
## 10 M004… 4 5 5 4 5 5 5 3 4 4 4 4
## # ℹ 1,541 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## # lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## # hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## # country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "United States", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
## question_number favorability_score
## 1 eng_1 0.9153200
## 2 eng_2 0.8312864
## 3 eng_3 0.7692806
## 4 eng_4 0.6735751
## 5 eng_5 0.7040155
## 6 lea_1 0.8000000
## 7 lea_2 0.8273616
## 8 lea_3 0.8654971
## 9 lea_4 0.8415584
# Calculate factor favorability score for engagement & leadership
## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
factor_favorable_score = numeric(),
stringsAsFactors = FALSE)
## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
# Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#use grep to find 'factor_abbre_" in favorability_score df
# Calculate the average favorable score for the factor
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE) # Add na.rm = TRUE to handle missing values
# Add each factor favorable score to the data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = "United States" # Add the country label
)
)
}
print(factor_favorable_score)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.7786955 United States
## 2 lea 0.8336043 United States
favorability_score$Country <- "United States"
#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
## question_number favorability_score Country
## 10 eng 0.7786955 United States
## 1 eng_1 0.9153200 United States
## 2 eng_2 0.8312864 United States
## 3 eng_3 0.7692806 United States
## 4 eng_4 0.6735751 United States
## 5 eng_5 0.7040155 United States
## 11 lea 0.8336043 United States
## 6 lea_1 0.8000000 United States
## 7 lea_2 0.8273616 United States
## 8 lea_3 0.8654971 United States
## 9 lea_4 0.8415584 United States
#let's only keep engagement scores
engagement_favorable_scores <- combined_favorability_df[1:6, ]
print(engagement_favorable_scores)
## question_number favorability_score Country
## 10 eng 0.7786955 United States
## 1 eng_1 0.9153200 United States
## 2 eng_2 0.8312864 United States
## 3 eng_3 0.7692806 United States
## 4 eng_4 0.6735751 United States
## 5 eng_5 0.7040155 United States
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")
# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC", # blue for the 'first' category
"other" = "#AEBAC2") # Grey for the 'other' category
library(scales)
# Create a ggplot that shows engagement factor favorable scores for U.S. employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for U.S. Employees") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14)
) +
geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) +
scale_fill_manual(values = highlight_colors) +
scale_y_continuous(labels = percent_format(accuracy = 1))
- j. create a visual that compares factor favorable scores for
engagement & leadership across countries
#Create an empty data frame to save the consolidated favorable_scores
consolidated_favorable_scores <- data.frame(
factor_abbreviation = character(),
factor_favorable_score = numeric(),
country = character(),
stringsAsFactors = FALSE
)
#list of countries
countries <- c("Australia", "Canada", "China", "Denmark", "France", "Germany", "India", "United Kingdom", "United States")
#Loop through each country and calculate factor favorable score
for (country in countries) {
#Filter data for the specific country
country_df <- df_2[df_2$country == country & !is.na(df_2$country), ]
#Calculate favorability_score using a custom function
favorability_score <- function_favorability(country_df[, c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number))
#Create a data frame for this country's factor_favorable_score (so a dataframe within a loop)
factor_favorable_score <- data.frame(
factor_abbreviation = character(),
factor_favorable_score = numeric(),
country = character(),
stringsAsFactors = FALSE
)
#Calculate the average favorable score for each factor
for (factor_abbr in factor_abbreviations) {
#Subset favorable scores for the factor
factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
#Calculate the average favorable score
avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)
#Append the calculated score to the country's data frame
factor_favorable_score <- rbind(
factor_favorable_score,
data.frame(
factor_abbreviation = factor_abbr,
factor_favorable_score = avg_favorable_score,
country = country # Add the country label
)
)
}
#Consolidate this country's data frame into the main consolidated data frame
consolidated_favorable_scores <- rbind(
consolidated_favorable_scores,
factor_favorable_score
)
}
print(consolidated_favorable_scores)
## factor_abbreviation factor_favorable_score country
## 1 eng 0.8277228 Australia
## 2 lea 0.9257426 Australia
## 3 eng 0.8380952 Canada
## 4 lea 0.8625932 Canada
## 5 eng 0.8066667 China
## 6 lea 0.8500000 China
## 7 eng 0.8083333 Denmark
## 8 lea 0.8618659 Denmark
## 9 eng 0.8338235 France
## 10 lea 0.9301471 France
## 11 eng 0.6962766 Germany
## 12 lea 0.8645833 Germany
## 13 eng 0.8440634 India
## 14 lea 0.9308116 India
## 15 eng 0.8099251 United Kingdom
## 16 lea 0.8995405 United Kingdom
## 17 eng 0.7786955 United States
## 18 lea 0.8336043 United States
#Sort the data frame by factor_favorable_score
consolidated_favorable_scores <- consolidated_favorable_scores %>%
arrange(desc(factor_favorable_score))
#Reorder the factor levels based on the sorted data frame
consolidated_favorable_scores$factor_abbreviation <- factor(
consolidated_favorable_scores$factor_abbreviation,
levels = unique(consolidated_favorable_scores$factor_abbreviation)
)
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(scales)
# Filter the data into two subsets: one for 'lea' (leadership factor) and one for 'eng' (engagement factor)
leadership_scores <- consolidated_favorable_scores %>%
filter(factor_abbreviation == "lea")
engagement_scores <- consolidated_favorable_scores %>%
filter(factor_abbreviation == "eng")
# Reorder the factor levels based on the sorted data frame
leadership_scores$country <- factor(
leadership_scores$country,
levels = unique(leadership_scores$country)
)
engagement_scores$country <- factor(
engagement_scores$country,
levels = unique(engagement_scores$country)
)
# Create ggplot for leadership scores
ggplot(leadership_scores, aes(x = country, y = factor_favorable_score * 100, fill = country)) +
geom_bar(stat = "identity", position = "dodge") + # Bars side-by-side
geom_text(
aes(label = paste0(round(factor_favorable_score * 100, 1), "%")),
position = position_dodge(width = 0.9),
vjust = -0.5 # Position the text above the bars
) +
labs(
x = "Country",
y = "Leadership Favorability Score (%)",
title = "Leadership Factor Favorable Scores by Country (%)"
) +
scale_y_continuous(labels = percent_format(scale = 1)) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
)
# Create ggplot for engagement scores
ggplot(engagement_scores, aes(x = country, y = factor_favorable_score * 100, fill = country)) +
geom_bar(stat = "identity", position = "dodge") + # Bars side-by-side
geom_text(
aes(label = paste0(round(factor_favorable_score * 100, 1), "%")),
position = position_dodge(width = 0.9),
vjust = -0.5 # Position the text above the bars
) +
labs(
x = "Country",
y = "Engagement Favorability Score (%)",
title = "Engagement Factor Favorable Scores by Country (%)"
) +
scale_y_continuous(labels = percent_format(scale = 1)) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
)